/*
 * Copyright (c) 2023, Alibaba Group Holding Limited;
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free
// queue. An overview, including benchmark results, is provided here:
//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
// The full design is also described in excruciating detail at:
//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue

// Simplified BSD license:
// Copyright (c) 2013-2020, Cameron Desrochers.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

// Also dual-licensed under the Boost Software License (see LICENSE.md)

#pragma once

#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
// Traits::index_t are set to < 32 bits, causing integer promotion, causing
// warnings upon assigning any computed values)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

#ifdef MCDBGQ_USE_RELACY
#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
#endif
#endif

#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
// VS2019 with /W4 warns about constant conditional expressions but unless
// /std=c++17 or higher does not support `if constexpr`, so we have no choice
// but to simply disable the warning
#pragma warning(push)
#pragma warning(disable : 4127)  // conditional expression is constant
#endif

#if defined(__APPLE__)
#include "TargetConditionals.h"
#endif

#ifdef MCDBGQ_USE_RELACY
#include "relacy/relacy_std.hpp"
#include "relacy_shims.h"
// We only use malloc/free anyway, and the delete macro messes up `= delete`
// method declarations. We'll override the default trait malloc ourselves
// without a macro.
#undef new
#undef delete
#undef malloc
#undef free
#else
#include <atomic>  // Requires C++11. Sorry VS2010.
#include <cassert>
#endif
#include <algorithm>
#include <array>
#include <climits>  // for CHAR_BIT
#include <cstddef>  // for max_align_t
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <mutex>  // used for thread exit synchronization
#include <thread>  // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
#include <type_traits>
#include <utility>

// Platform-specific definitions of a numeric thread ID type and an invalid
// value
namespace ylt::detail::moodycamel {
namespace details {
// Maps a platform-specific thread ID type to (a) a numeric type of matching
// size and (b) a type suitable for hashing. This primary template is the
// identity mapping, which suffices for platforms whose thread IDs are already
// integral; platforms with opaque IDs (e.g. std::thread::id) specialize it.
template <typename TId>
struct thread_id_converter {
  typedef TId thread_id_numeric_size_t;  // numeric type the ID fits in
  typedef TId thread_id_hash_t;          // type fed to the hash function
  // Pre-hash step: for numeric IDs the identity conversion is enough.
  static thread_id_hash_t prehash(TId const& x) { return x; }
};
}  // namespace details
}  // namespace ylt::detail::moodycamel
#if defined(MCDBGQ_USE_RELACY)
// Under the Relacy race detector, thread IDs are the small dense indices
// assigned by the Relacy runtime itself.
namespace ylt::detail::moodycamel {
namespace details {
typedef std::uint32_t thread_id_t;
// Sentinels chosen outside the range of indices Relacy hands out.
static const thread_id_t invalid_thread_id = 0xFFFFFFFFU;
static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
static inline thread_id_t thread_id() { return rl::thread_index(); }
}  // namespace details
}  // namespace ylt::detail::moodycamel
#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
// No sense pulling in windows.h in a header, we'll manually declare the
// function we use and rely on backwards-compatibility for this not to break
extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(
    void);
namespace ylt::detail::moodycamel {
namespace details {
static_assert(sizeof(unsigned long) == sizeof(std::uint32_t),
              "Expected size of unsigned long to be 32 bits on Windows");
typedef std::uint32_t thread_id_t;
static const thread_id_t invalid_thread_id =
    0;  // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
static const thread_id_t invalid_thread_id2 =
    0xFFFFFFFFU;  // Not technically guaranteed to be invalid, but is never used
                  // in practice. Note that all Win32 thread IDs are presently
                  // multiples of 4.
// The OS-assigned numeric thread ID doubles directly as our thread_id_t.
static inline thread_id_t thread_id() {
  return static_cast<thread_id_t>(::GetCurrentThreadId());
}
}  // namespace details
}  // namespace ylt::detail::moodycamel
#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
    (defined(__APPLE__) && TARGET_OS_IPHONE) ||                      \
    defined(MOODYCAMEL_NO_THREAD_LOCAL)
// Platforms without (reliable) thread_local support: fall back to
// std::thread::id, which is opaque and therefore needs a converter
// specialization to be hashed.
namespace ylt::detail::moodycamel {
namespace details {
static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8,
              "std::thread::id is expected to be either 4 or 8 bytes");

typedef std::thread::id thread_id_t;
static const thread_id_t invalid_thread_id;  // Default ctor creates invalid ID

// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have
// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined
// anyway, which it won't be.
static inline thread_id_t thread_id() { return std::this_thread::get_id(); }

// Maps the byte size of std::thread::id to a same-sized unsigned integer type.
template <std::size_t>
struct thread_id_size {};
template <>
struct thread_id_size<4> {
  typedef std::uint32_t numeric_t;
};
template <>
struct thread_id_size<8> {
  typedef std::uint64_t numeric_t;
};

template <>
struct thread_id_converter<thread_id_t> {
  typedef thread_id_size<sizeof(thread_id_t)>::numeric_t
      thread_id_numeric_size_t;
#ifndef __APPLE__
  typedef std::size_t thread_id_hash_t;
#else
  typedef thread_id_numeric_size_t thread_id_hash_t;
#endif

  static thread_id_hash_t prehash(thread_id_t const& x) {
#ifndef __APPLE__
    return std::hash<std::thread::id>()(x);
#else
    // Reinterprets the bytes of std::thread::id as an integer; assumes the
    // implementation wraps a single integral value -- TODO confirm on each
    // Apple toolchain update.
    return *reinterpret_cast<thread_id_hash_t const*>(&x);
#endif
  }
};
}  // namespace details
}  // namespace ylt::detail::moodycamel
#else
// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
// In order to get a numeric thread ID in a platform-independent way, we use a
// thread-local static variable's address as a thread identifier :-)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define MOODYCAMEL_THREADLOCAL __thread
#elif defined(_MSC_VER)
#define MOODYCAMEL_THREADLOCAL __declspec(thread)
#else
// Assume C++11 compliant compiler
#define MOODYCAMEL_THREADLOCAL thread_local
#endif
namespace ylt::detail::moodycamel {
namespace details {
typedef std::uintptr_t thread_id_t;
static const thread_id_t invalid_thread_id = 0;  // Address can't be nullptr
static const thread_id_t invalid_thread_id2 =
    1;  // Member accesses off a null pointer are also generally invalid. Plus
        // it's not aligned.
// The address of a thread-local variable is unique per live thread.
inline thread_id_t thread_id() {
  static MOODYCAMEL_THREADLOCAL int x;
  return reinterpret_cast<thread_id_t>(&x);
}
}  // namespace details
}  // namespace ylt::detail::moodycamel
#endif

// Constexpr if
#ifndef MOODYCAMEL_CONSTEXPR_IF
#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \
    __cplusplus > 201402L
#define MOODYCAMEL_CONSTEXPR_IF if constexpr
#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
#else
#define MOODYCAMEL_CONSTEXPR_IF if
#define MOODYCAMEL_MAYBE_UNUSED
#endif
#endif

// Exceptions
#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
#if (defined(_MSC_VER) && defined(_CPPUNWIND)) ||   \
    (defined(__GNUC__) && defined(__EXCEPTIONS)) || \
    (!defined(_MSC_VER) && !defined(__GNUC__))
#define MOODYCAMEL_EXCEPTIONS_ENABLED
#endif
#endif
#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
#define MOODYCAMEL_TRY try
#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
#define MOODYCAMEL_RETHROW throw
#define MOODYCAMEL_THROW(expr) throw(expr)
#else
#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true)
#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false)
#define MOODYCAMEL_RETHROW
#define MOODYCAMEL_THROW(expr)
#endif

#ifndef MOODYCAMEL_NOEXCEPT
#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
#define MOODYCAMEL_NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when
// it shouldn't :-( We have to assume *all* non-trivial constructors may throw
// on VS2012!
#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)    \
  (std::is_rvalue_reference<valueType>::value &&           \
           std::is_move_constructible<type>::value         \
       ? std::is_trivially_move_constructible<type>::value \
       : std::is_trivially_copy_constructible<type>::value)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)      \
  ((std::is_rvalue_reference<valueType>::value &&              \
            std::is_move_assignable<type>::value               \
        ? std::is_trivially_move_assignable<type>::value ||    \
              std::is_nothrow_move_assignable<type>::value     \
        : std::is_trivially_copy_assignable<type>::value ||    \
              std::is_nothrow_copy_assignable<type>::value) && \
   MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)       \
  (std::is_rvalue_reference<valueType>::value &&              \
           std::is_move_constructible<type>::value            \
       ? std::is_trivially_move_constructible<type>::value || \
             std::is_nothrow_move_constructible<type>::value  \
       : std::is_trivially_copy_constructible<type>::value || \
             std::is_nothrow_copy_constructible<type>::value)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr)      \
  ((std::is_rvalue_reference<valueType>::value &&              \
            std::is_move_assignable<type>::value               \
        ? std::is_trivially_move_assignable<type>::value ||    \
              std::is_nothrow_move_assignable<type>::value     \
        : std::is_trivially_copy_assignable<type>::value ||    \
              std::is_nothrow_copy_assignable<type>::value) && \
   MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
#else
#define MOODYCAMEL_NOEXCEPT noexcept
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
#endif
#endif

#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#ifdef MCDBGQ_USE_RELACY
#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#else
// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a
// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't
// support thread_local either. Finally, iOS/ARM doesn't have support for it
// either, and g++/ARM allows it to compile but it's unconfirmed to actually
// work
#if (!defined(_MSC_VER) || _MSC_VER >= 1900) &&                        \
    (!defined(__MINGW32__) && !defined(__MINGW64__) ||                 \
     !defined(__WINPTHREADS_VERSION)) &&                               \
    (!defined(__GNUC__) || __GNUC__ > 4 ||                             \
     (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) &&                        \
    (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \
    !defined(_M_ARM) && !defined(__aarch64__)
// Assume `thread_local` is fully supported in all other C++11
// compilers/platforms
#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED  // tentatively enabled for now;
                                                 // years ago several users
                                                 // report having problems with
                                                 // it on
#endif
#endif
#endif

// VS2012 doesn't support deleted functions.
// In this case, we declare the function normally but don't define it. A link
// error will be generated if the function is called.
#ifndef MOODYCAMEL_DELETE_FUNCTION
#if defined(_MSC_VER) && _MSC_VER < 1800
#define MOODYCAMEL_DELETE_FUNCTION
#else
#define MOODYCAMEL_DELETE_FUNCTION = delete
#endif
#endif

namespace ylt::detail::moodycamel {
namespace details {
#ifndef MOODYCAMEL_ALIGNAS
// VS2013 doesn't support alignas or alignof, and align() requires a constant
// literal
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
  typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
// __declspec(align(...)) needs a literal, so each supported power-of-two
// alignment gets its own explicit specialization below.
template <int Align, typename T>
struct Vs2013Aligned {};  // default, unsupported alignment
template <typename T>
struct Vs2013Aligned<1, T> {
  typedef __declspec(align(1)) T type;
};
template <typename T>
struct Vs2013Aligned<2, T> {
  typedef __declspec(align(2)) T type;
};
template <typename T>
struct Vs2013Aligned<4, T> {
  typedef __declspec(align(4)) T type;
};
template <typename T>
struct Vs2013Aligned<8, T> {
  typedef __declspec(align(8)) T type;
};
template <typename T>
struct Vs2013Aligned<16, T> {
  typedef __declspec(align(16)) T type;
};
template <typename T>
struct Vs2013Aligned<32, T> {
  typedef __declspec(align(32)) T type;
};
template <typename T>
struct Vs2013Aligned<64, T> {
  typedef __declspec(align(64)) T type;
};
template <typename T>
struct Vs2013Aligned<128, T> {
  typedef __declspec(align(128)) T type;
};
template <typename T>
struct Vs2013Aligned<256, T> {
  typedef __declspec(align(256)) T type;
};
#else
// Modern path: plain alignas/alignof; identity keeps the macro expansion a
// dependent type so it parses the same way as the VS2013 variant.
template <typename T>
struct identity {
  typedef T type;
};
#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
  alignas(alignof(obj)) typename details::identity<T>::type
#endif
#endif
}  // namespace details
}  // namespace ylt::detail::moodycamel

// TSAN can false report races in lock-free code.  To enable TSAN to be used
// from projects that use this one, we can apply per-function compile-time
// suppression. See
// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
#define MOODYCAMEL_NO_TSAN
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#undef MOODYCAMEL_NO_TSAN
#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
#endif  // TSAN
#endif  // TSAN

// Compiler-specific likely/unlikely hints
namespace ylt::detail::moodycamel {
namespace details {
// Branch-prediction hints. The function names are parenthesized so they are
// immune to any function-like `likely`/`unlikely` macros a platform defines.
#if defined(__GNUC__)
static inline bool(likely)(bool x) { return __builtin_expect((x), true); }
static inline bool(unlikely)(bool x) { return __builtin_expect((x), false); }
#else
// No hint support: plain pass-through.
static inline bool(likely)(bool x) { return x; }
static inline bool(unlikely)(bool x) { return x; }
#endif
}  // namespace details
}  // namespace ylt::detail::moodycamel

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
#include "internal/concurrentqueue_internal_debug.h"
#endif

namespace ylt::detail::moodycamel {
namespace details {
// Compile-time maximum value of an integral type T: for signed types, set all
// bits except the sign bit; for unsigned types, -1 converts to all-ones.
// NOTE(review): kept as a hand-rolled constant expression rather than
// std::numeric_limits<T>::max(), presumably because max() was not usable as an
// in-class static const initializer on pre-C++11-constexpr compilers (the
// VS2012/2013 paths this header supports) -- confirm before simplifying.
template <typename T>
struct const_numeric_max {
  static_assert(std::is_integral<T>::value,
                "const_numeric_max can only be used with integers");
  static const T value =
      std::numeric_limits<T>::is_signed
          ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) -
                static_cast<T>(1)
          : static_cast<T>(-1);
};

#if defined(__GLIBCXX__)
typedef ::max_align_t
    std_max_align_t;  // libstdc++ forgot to add it to std:: for a while
#else
typedef std::max_align_t std_max_align_t;  // Others (e.g. MSVC) insist it can
                                           // *only* be accessed via std::
#endif

// Some platforms have incorrectly set max_align_t to a type with <8 bytes
// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit
// iOS). Work around this with our own union. See issue #64.
// The union's alignment is the strictest of its members, so this is at least
// as aligned as the platform's (possibly broken) max_align_t.
typedef union {
  std_max_align_t x;
  long long y;
  void* z;
} max_align_t;
}  // namespace details

// Default traits for the ConcurrentQueue. To change some of the
// traits without re-implementing all of them, inherit from this
// struct and shadow the declarations you wish to be different;
// since the traits are used as a template type parameter, the
// shadowed declarations will be used where defined, and the defaults
// otherwise.
struct ConcurrentQueueDefaultTraits {
  // General-purpose size type. std::size_t is strongly recommended.
  typedef std::size_t size_t;

  // The type used for the enqueue and dequeue indices. Must be at least as
  // large as size_t. Should be significantly larger than the number of elements
  // you expect to hold at once, especially if you have a high turnover rate;
  // for example, on 32-bit x86, if you expect to have over a hundred million
  // elements or pump several million elements through your queue in a very
  // short space of time, using a 32-bit type *may* trigger a race condition.
  // A 64-bit int type is recommended in that case, and in practice will
  // prevent a race condition no matter the usage of the queue. Note that
  // whether the queue is lock-free with a 64-int type depends on the whether
  // std::atomic<std::uint64_t> is lock-free, which is platform-specific.
  typedef std::size_t index_t;

  // Internally, all elements are enqueued and dequeued from multi-element
  // blocks; this is the smallest controllable unit. If you expect few elements
  // but many producers, a smaller block size should be favoured. For few
  // producers and/or many elements, a larger block size is preferred. A sane
  // default is provided. Must be a power of 2.
  static const size_t QUEUE_BLOCK_SIZE = 32;

  // For explicit producers (i.e. when using a producer token), the block is
  // checked for being empty by iterating through a list of flags, one per
  // element. For large block sizes, this is too inefficient, and switching to
  // an atomic counter-based approach is faster. The switch is made for block
  // sizes strictly larger than this threshold.
  static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;

  // How many full blocks can be expected for a single explicit producer? This
  // should reflect that number's maximum for optimal performance. Must be a
  // power of 2.
  static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;

  // How many full blocks can be expected for a single implicit producer? This
  // should reflect that number's maximum for optimal performance. Must be a
  // power of 2.
  static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;

  // The initial size of the hash table mapping thread IDs to implicit
  // producers. Note that the hash is resized every time it becomes half full.
  // Must be a power of two, and either 0 or at least 1. If 0, implicit
  // production (using the enqueue methods without an explicit producer token)
  // is disabled.
  static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;

  // Controls the number of items that an explicit consumer (i.e. one with a
  // token) must consume before it causes all consumers to rotate and move on to
  // the next internal queue.
  static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
      256;

  // The maximum number of elements (inclusive) that can be enqueued to a
  // sub-queue. Enqueue operations that would cause this limit to be surpassed
  // will fail. Note that this limit is enforced at the block level (for
  // performance reasons), i.e. it's rounded up to the nearest block size.
  // The default is effectively unlimited (the maximum value of size_t).
  static const size_t MAX_SUBQUEUE_SIZE =
      details::const_numeric_max<size_t>::value;

  // The number of times to spin before sleeping when waiting on a semaphore.
  // Recommended values are on the order of 1000-10000 unless the number of
  // consumer threads exceeds the number of idle cores (in which case try
  // 0-100). Only affects instances of the BlockingConcurrentQueue.
  static const int MAX_SEMA_SPINS = 10000;

  // Whether to recycle dynamically-allocated blocks into an internal free list
  // or not. If false, only pre-allocated blocks (controlled by the constructor
  // arguments) will be recycled, and all others will be `free`d back to the
  // heap. Note that blocks consumed by explicit producers are only freed on
  // destruction of the queue (not following destruction of the token)
  // regardless of this trait.
  static const bool RECYCLE_ALLOCATED_BLOCKS = false;

#ifndef MCDBGQ_USE_RELACY
  // Memory allocation can be customized if needed.
  // malloc should return nullptr on failure, and handle alignment like
  // std::malloc.
#if defined(malloc) || defined(free)
  // Gah, this is 2015, stop defining macros that break standard code already!
  // Work around malloc/free being special macros:
  static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
  static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
  // The parenthesized names below are not macro-expanded, so these define
  // genuine member functions named malloc/free.
  static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); }
  static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); }
#else
  static inline void* malloc(size_t size) { return std::malloc(size); }
  static inline void free(void* ptr) { return std::free(ptr); }
#endif
#else
  // Debug versions when running under the Relacy race detector (ignore
  // these in user code)
  static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
  static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
#endif
};

// When producing or consuming many elements, the most efficient way is to:
//    1) Use one of the bulk-operation methods of the queue with a token
//    2) Failing that, use the bulk-operation methods without a token
//    3) Failing that, create a token and use that with the single-item methods
//    4) Failing that, use the single-parameter methods of the queue
// Having said that, don't create tokens willy-nilly -- ideally there should be
// a maximum of one token per thread (of each kind).
struct ProducerToken;
struct ConsumerToken;

template <typename T, typename Traits>
class ConcurrentQueue;
template <typename T, typename Traits>
class BlockingConcurrentQueue;
class ConcurrentQueueTests;

namespace details {
// Type-erased base for internal producer queues, letting tokens and the
// producer list refer to producers without knowing the element type.
// (Kept with a constructor init-list rather than in-class initializers for
// compatibility with the old-MSVC paths this header supports.)
struct ConcurrentQueueProducerTypelessBase {
  ConcurrentQueueProducerTypelessBase* next;  // intrusive producer-list link
  std::atomic<bool> inactive;  // set when the owning token is destroyed
  ProducerToken* token;        // back-pointer to the owning token, if any

  ConcurrentQueueProducerTypelessBase()
      : next(nullptr), inactive(false), token(nullptr) {}
};

// MurmurHash3 finalizers, selected by word size. The thread ID is already
// unique; the finalizer merely spreads that uniqueness evenly across all the
// bits, so that a subset of the bits can be used while keeping collisions low.
// See
// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
template <bool use32>
struct _hash_32_or_64 {
  // 32-bit finalizer.
  static inline std::uint32_t hash(std::uint32_t h) {
    std::uint32_t v = h;
    v ^= v >> 16;
    v *= 0x85ebca6b;
    v ^= v >> 13;
    v *= 0xc2b2ae35;
    v ^= v >> 16;
    return v;
  }
};
template <>
struct _hash_32_or_64<1> {
  // 64-bit finalizer (same structure, 64-bit mixing constants).
  static inline std::uint64_t hash(std::uint64_t h) {
    std::uint64_t v = h;
    v ^= v >> 33;
    v *= 0xff51afd7ed558ccd;
    v ^= v >> 33;
    v *= 0xc4ceb9fe1a85ec53;
    v ^= v >> 33;
    return v;
  }
};
// Dispatch on the byte size of the value being hashed: > 4 bytes selects the
// 64-bit finalizer.
template <std::size_t size>
struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {};

// Hashes a platform thread ID down to a size_t suitable for table indexing,
// routing it through the platform's converter and the matching-width
// MurmurHash3 finalizer.
static inline size_t hash_thread_id(thread_id_t id) {
  static_assert(
      sizeof(thread_id_t) <= 8,
      "Expected a platform where thread IDs are at most 64-bit values");
  typedef thread_id_converter<thread_id_t> converter_t;
  return static_cast<size_t>(
      hash_32_or_64<sizeof(converter_t::thread_id_hash_t)>::hash(
          converter_t::prehash(id)));
}

// Compares two indices under modular (wrap-around) arithmetic: returns true
// iff `a` circularly precedes `b`, i.e. the wrapped difference a - b lands in
// the upper half of the type's value range.
template <typename T>
static inline bool circular_less_than(T a, T b) {
  static_assert(
      std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
      "circular_less_than is intended to be used only with unsigned integer "
      "types");
  const T half_range = static_cast<T>(
      static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
  // Note: extra parens around rhs of operator<< is MSVC bug:
  // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
  //       silencing the bug requires #pragma warning(disable: 4554) around the
  //       calling code and has no effect when done here.
  return static_cast<T>(a - b) > half_range;
}

// Advances `ptr` forward to the next address suitably aligned for a U
// (returns `ptr` unchanged when it is already aligned).
template <typename U>
static inline char* align_for(char* ptr) {
  const std::size_t alignment = std::alignment_of<U>::value;
  const std::size_t misalignment =
      reinterpret_cast<std::uintptr_t>(ptr) % alignment;
  return misalignment == 0 ? ptr : ptr + (alignment - misalignment);
}

// Rounds x up to the nearest power of two; a value that is already a power of
// two is returned unchanged. Adapted from
// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
template <typename T>
static inline T ceil_to_pow_2(T x) {
  static_assert(
      std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
      "ceil_to_pow_2 is intended to be used only with unsigned integer types");

  // Smear the highest set bit of (x - 1) into every lower position, then add
  // one to carry into the next power of two.
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  // Keep doubling the smear width (in bits) for types wider than one byte.
  for (std::size_t shift = 8; shift < sizeof(T) * CHAR_BIT; shift <<= 1) {
    x |= x >> shift;
  }
  ++x;
  return x;
}

// Swaps the contents of two atomics using relaxed loads and stores. This is
// NOT an atomic swap as a whole -- callers must guarantee no concurrent
// access to either atomic for the duration.
//
// Fix: dropped the redundant std::move() calls. atomic<T>::load() returns a
// prvalue, so moving it is a no-op at best and inhibits copy elision (see
// clang-tidy performance-move-const-arg); std::atomic also requires T to be
// trivially copyable, so a move is a copy anyway.
template <typename T>
static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right) {
  T temp = left.load(std::memory_order_relaxed);
  left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed);
  right.store(temp, std::memory_order_relaxed);
}

// Identity pass-through yielding a const lvalue reference; used at call sites
// to force copy (rather than move) semantics on an argument.
template <typename T>
static inline T const& nomove(T const& value) {
  return value;
}

// Compile-time switch between copy and move semantics: when Enable is true,
// eval() always yields a const lvalue reference (forcing a copy at the use
// site); when false, it perfectly forwards its argument unchanged.
template <bool Enable>
struct nomove_if {
  template <typename T>
  static inline T const& eval(T const& value) {
    return value;
  }
};

template <>
struct nomove_if<false> {
  // Perfect-forwarding pass-through: rvalues stay rvalues, lvalues stay
  // lvalues.
  template <typename U>
  static inline auto eval(U&& value) -> decltype(std::forward<U>(value)) {
    return std::forward<U>(value);
  }
};

// Dereferences an iterator from inside a function declared MOODYCAMEL_NOEXCEPT
// (plain `noexcept` when exceptions are enabled), preserving the exact
// reference type of *it via the trailing decltype.
// NOTE(review): presumably exists so bulk operations can treat iterator
// dereference as non-throwing in noexcept computations -- confirm at call
// sites.
template <typename It>
static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) {
  return *it;
}

// std::is_trivially_destructible shim: g++ < 4.8 never shipped the standard
// trait, but provides the equivalent (non-standard) has_trivial_destructor.
#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \
    (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
template <typename T>
struct is_trivially_destructible : std::is_trivially_destructible<T> {};
#else
template <typename T>
struct is_trivially_destructible : std::has_trivial_destructor<T> {};
#endif

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#ifdef MCDBGQ_USE_RELACY
typedef RelacyThreadExitListener ThreadExitListener;
typedef RelacyThreadExitNotifier ThreadExitNotifier;
#else
class ThreadExitNotifier;

// A callback registration that fires when the subscribing thread exits.
// Instances are linked into a per-thread intrusive list owned by the
// ThreadExitNotifier below.
struct ThreadExitListener {
  typedef void (*callback_t)(void*);
  callback_t callback;  // invoked with userData at thread exit
  void* userData;

  ThreadExitListener* next;   // reserved for use by the ThreadExitNotifier
  ThreadExitNotifier* chain;  // reserved for use by the ThreadExitNotifier
};

// Per-thread singleton (via a thread_local instance) that maintains an
// intrusive singly-linked list of ThreadExitListeners and invokes their
// callbacks from its destructor, i.e. when the owning thread exits. A single
// process-wide mutex guards all list manipulation.
class ThreadExitNotifier {
 public:
  // Registers `listener` with the calling thread's notifier (pushed at the
  // front of the list). The listener's `next`/`chain` fields are overwritten.
  static void subscribe(ThreadExitListener* listener) {
    auto& tlsInst = instance();
    std::lock_guard<std::mutex> guard(mutex());
    listener->next = tlsInst.tail;
    listener->chain = &tlsInst;
    tlsInst.tail = listener;
  }

  // Removes `listener` from whichever notifier it was subscribed to. Safe to
  // call from a different thread than the subscriber.
  static void unsubscribe(ThreadExitListener* listener) {
    std::lock_guard<std::mutex> guard(mutex());
    if (!listener->chain) {
      return;  // race with ~ThreadExitNotifier
    }
    auto& tlsInst = *listener->chain;
    listener->chain = nullptr;
    // Unlink via a pointer-to-link walk so head removal needs no special case.
    ThreadExitListener** prev = &tlsInst.tail;
    for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
      if (ptr == listener) {
        *prev = ptr->next;
        break;
      }
      prev = &ptr->next;
    }
  }

 private:
  ThreadExitNotifier() : tail(nullptr) {}
  ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
  ThreadExitNotifier& operator=(ThreadExitNotifier const&)
      MOODYCAMEL_DELETE_FUNCTION;

  ~ThreadExitNotifier() {
    // This thread is about to exit, let everyone know!
    assert(this == &instance() &&
           "If this assert fails, you likely have a buggy compiler! Change the "
           "preprocessor conditions such that "
           "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
    std::lock_guard<std::mutex> guard(mutex());
    for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
      // Clear the back-pointer first so a concurrent unsubscribe() sees the
      // notifier is gone and bails out.
      ptr->chain = nullptr;
      ptr->callback(ptr->userData);
    }
  }

  // Thread-local
  static inline ThreadExitNotifier& instance() {
    static thread_local ThreadExitNotifier notifier;
    return notifier;
  }

  static inline std::mutex& mutex() {
    // Must be static because the ThreadExitNotifier could be destroyed while
    // unsubscribe is called
    static std::mutex mutex;
    return mutex;
  }

 private:
  ThreadExitListener* tail;  // head of the intrusive listener list
};
#endif
#endif

// Compile-time lock-freedom query: `value` mirrors the ATOMIC_*_LOCK_FREE
// macros (0 = never, 1 = sometimes, 2 = always) for the given type. The
// primary template reports 0 for unknown types.
template <typename T>
struct static_is_lock_free_num {
  enum { value = 0 };
};
template <>
struct static_is_lock_free_num<signed char> {
  enum { value = ATOMIC_CHAR_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<short> {
  enum { value = ATOMIC_SHORT_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<int> {
  enum { value = ATOMIC_INT_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<long> {
  enum { value = ATOMIC_LONG_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<long long> {
  enum { value = ATOMIC_LLONG_LOCK_FREE };
};
// Funnel both signed and unsigned variants through the signed specializations
// via make_signed; bool and pointers get their own dedicated macros.
template <typename T>
struct static_is_lock_free
    : static_is_lock_free_num<typename std::make_signed<T>::type> {};
template <>
struct static_is_lock_free<bool> {
  enum { value = ATOMIC_BOOL_LOCK_FREE };
};
template <typename U>
struct static_is_lock_free<U*> {
  enum { value = ATOMIC_POINTER_LOCK_FREE };
};
}  // namespace details

// A handle that pins enqueue operations to one specific internal producer
// queue. Move-only; the referenced producer slot keeps a back-pointer to the
// token that currently owns it.
struct ProducerToken {
  template <typename T, typename Traits>
  explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);

  template <typename T, typename Traits>
  explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);

  // Move construction: steal the producer slot and re-aim its back-pointer at
  // this token, leaving `other` invalid.
  ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
      : producer(other.producer) {
    other.producer = nullptr;
    if (producer != nullptr) {
      producer->token = this;
    }
  }

  // Move assignment is implemented as a swap, so both tokens may remain valid
  // afterwards.
  inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT {
    swap(other);
    return *this;
  }

  // Exchanges producer slots with `other`, then repairs both slots'
  // back-pointers so each points at the token that now owns it.
  void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT {
    details::ConcurrentQueueProducerTypelessBase* mine = producer;
    producer = other.producer;
    other.producer = mine;
    if (producer != nullptr) {
      producer->token = this;
    }
    if (other.producer != nullptr) {
      other.producer->token = &other;
    }
  }

  // A token is always valid unless:
  //     1) Memory allocation failed during construction
  //     2) It was moved via the move constructor
  //        (Note: assignment does a swap, leaving both potentially valid)
  //     3) The associated queue was destroyed
  // Note that if valid() returns true, that only indicates
  // that the token is valid for use with a specific queue,
  // but not which one; that's up to the user to track.
  inline bool valid() const { return producer != nullptr; }

  ~ProducerToken() {
    if (producer != nullptr) {
      // Detach from the slot and publish it as inactive so the queue can
      // recycle it.
      producer->token = nullptr;
      producer->inactive.store(true, std::memory_order_release);
    }
  }

  // Disable copying and assignment
  ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
  ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;

 private:
  template <typename T, typename Traits>
  friend class ConcurrentQueue;
  friend class ConcurrentQueueTests;

 protected:
  details::ConcurrentQueueProducerTypelessBase* producer;
};

// Handle caching a consumer's rotation state (its position in the producer
// list and how far it is through its consumption quota).
struct ConsumerToken {
  template <typename T, typename Traits>
  explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);

  template <typename T, typename Traits>
  explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);

  // Move construction copies the cached state; unlike ProducerToken there is
  // no back-reference to fix up, and the source is left untouched.
  ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
      : initialOffset(other.initialOffset),
        lastKnownGlobalOffset(other.lastKnownGlobalOffset),
        itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
        currentProducer(other.currentProducer),
        desiredProducer(other.desiredProducer) {}

  inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT {
    swap(other);
    return *this;
  }

  // Exchanges all cached rotation state with `other`. Not thread-safe.
  void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT {
    std::swap(initialOffset, other.initialOffset);
    std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
    std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
    std::swap(currentProducer, other.currentProducer);
    std::swap(desiredProducer, other.desiredProducer);
  }

  // Disable copying and assignment
  ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
  ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;

 private:
  template <typename T, typename Traits>
  friend class ConcurrentQueue;
  friend class ConcurrentQueueTests;

 private:  // but shared with ConcurrentQueue
  std::uint32_t initialOffset;  // fixed starting slot assigned at construction
  std::uint32_t lastKnownGlobalOffset;  // last observed global rotation offset
  std::uint32_t itemsConsumedFromCurrent;  // progress toward rotation quota
  details::ConcurrentQueueProducerTypelessBase* currentProducer;
  details::ConcurrentQueueProducerTypelessBase* desiredProducer;
};

// Need to forward-declare this swap because it's in a namespace.
// See
// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
// Declaration only; allows ConcurrentQueue to name it as a friend.
template <typename T, typename Traits>
inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
    MOODYCAMEL_NOEXCEPT;

template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
class ConcurrentQueue {
 public:
  typedef ::ylt::detail::moodycamel::ProducerToken producer_token_t;
  typedef ::ylt::detail::moodycamel::ConsumerToken consumer_token_t;

  typedef typename Traits::index_t index_t;
  typedef typename Traits::size_t size_t;

  // Materialize the Traits configuration constants at the proper widths.
  static const size_t QUEUE_BLOCK_SIZE =
      static_cast<size_t>(Traits::QUEUE_BLOCK_SIZE);
  static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
      static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
  static const size_t EXPLICIT_INITIAL_INDEX_SIZE =
      static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
  static const size_t IMPLICIT_INITIAL_INDEX_SIZE =
      static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
  static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
      static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
  static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
      static_cast<std::uint32_t>(
          Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4307)  // + integral constant overflow (that's what
                                 // the ternary expression is for!)
#pragma warning(disable : 4309)  // static_cast: Truncation of constant value
#endif
  // Round Traits::MAX_SUBQUEUE_SIZE up to a multiple of the block size,
  // saturating at the type's maximum when the rounding would overflow.
  static const size_t MAX_SUBQUEUE_SIZE =
      (details::const_numeric_max<size_t>::value -
           static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) <
       QUEUE_BLOCK_SIZE)
          ? details::const_numeric_max<size_t>::value
          : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) +
              (QUEUE_BLOCK_SIZE - 1)) /
             QUEUE_BLOCK_SIZE * QUEUE_BLOCK_SIZE);
#ifdef _MSC_VER
#pragma warning(pop)
#endif

  // Validate the Traits configuration at compile time.
  static_assert(!std::numeric_limits<size_t>::is_signed &&
                    std::is_integral<size_t>::value,
                "Traits::size_t must be an unsigned integral type");
  static_assert(!std::numeric_limits<index_t>::is_signed &&
                    std::is_integral<index_t>::value,
                "Traits::index_t must be an unsigned integral type");
  static_assert(sizeof(index_t) >= sizeof(size_t),
                "Traits::index_t must be at least as wide as Traits::size_t");
  static_assert(
      (QUEUE_BLOCK_SIZE > 1) && !(QUEUE_BLOCK_SIZE & (QUEUE_BLOCK_SIZE - 1)),
      "Traits::QUEUE_BLOCK_SIZE must be a power of 2 (and at least 2)");
  static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
                    !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD &
                      (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
                "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a "
                "power of 2 (and greater than 1)");
  static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) &&
                    !(EXPLICIT_INITIAL_INDEX_SIZE &
                      (EXPLICIT_INITIAL_INDEX_SIZE - 1)),
                "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
                "greater than 1)");
  static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) &&
                    !(IMPLICIT_INITIAL_INDEX_SIZE &
                      (IMPLICIT_INITIAL_INDEX_SIZE - 1)),
                "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
                "greater than 1)");
  static_assert(
      (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
          !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE &
            (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
      "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
  static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 ||
                    INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
                "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least "
                "1 (or 0 to disable implicit enqueueing)");

 public:
  // Creates a queue with at least `capacity` element slots. Note that the
  // number of elements that can actually be inserted without further memory
  // allocation depends on the number of producers and the block size: if the
  // block size equals `capacity`, only one block is pre-allocated, so only a
  // single producer could enqueue without triggering an allocation (blocks
  // are never shared between producers).
  // Not thread-safe: the user must ensure the queue is fully constructed --
  // including making the memory effects of construction visible, possibly
  // via a memory barrier -- before any other thread uses it.
  explicit ConcurrentQueue(size_t capacity = 32 * QUEUE_BLOCK_SIZE)
      : producerListTail(nullptr),
        producerCount(0),
        initialBlockPoolIndex(0),
        nextExplicitConsumerId(0),
        globalExplicitConsumerOffset(0) {
    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
    populate_initial_implicit_producer_hash();
    // Blocks needed to hold `capacity` elements, rounded up; written this way
    // (rather than the usual `+ BLOCK - 1` trick) so the computation cannot
    // overflow for capacities near the top of the type's range.
    size_t initialBlockCount = capacity / QUEUE_BLOCK_SIZE;
    if ((capacity & (QUEUE_BLOCK_SIZE - 1)) != 0) {
      ++initialBlockCount;
    }
    populate_initial_block_list(initialBlockCount);

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
    // Track all the producers using a fully-resolved typed list for
    // each kind; this makes it possible to debug them starting from
    // the root queue object (otherwise wacky casts are needed that
    // don't compile in the debugger's expression evaluator).
    explicitProducers.store(nullptr, std::memory_order_relaxed);
    implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif
  }

  // Computes the correct amount of pre-allocated blocks from the minimum
  // number of elements you want available at any given time and the maximum
  // concurrent number of each type of producer. Not thread-safe.
  ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers,
                  size_t maxImplicitProducers)
      : producerListTail(nullptr),
        producerCount(0),
        initialBlockPoolIndex(0),
        nextExplicitConsumerId(0),
        globalExplicitConsumerOffset(0) {
    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
    populate_initial_implicit_producer_hash();
    // ceil(minCapacity / BLOCK) - 1 blocks per "full" producer share, plus
    // two extra blocks for every producer of either kind.
    size_t fullShareBlocks =
        ((minCapacity + QUEUE_BLOCK_SIZE - 1) / QUEUE_BLOCK_SIZE) - 1;
    size_t blocks = fullShareBlocks * (maxExplicitProducers + 1) +
                    2 * (maxExplicitProducers + maxImplicitProducers);
    populate_initial_block_list(blocks);

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
    explicitProducers.store(nullptr, std::memory_order_relaxed);
    implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif
  }

  // Note: The queue should not be accessed concurrently while it's
  // being deleted. It's up to the user to synchronize this.
  // This method is not thread safe.
  ~ConcurrentQueue() {
    // Destroy producers
    auto ptr = producerListTail.load(std::memory_order_relaxed);
    while (ptr != nullptr) {
      auto next = ptr->next_prod();
      if (ptr->token != nullptr) {
        // Sever the back-link so a token that outlives the queue reports
        // itself as invalid instead of dangling.
        ptr->token->producer = nullptr;
      }
      destroy(ptr);
      ptr = next;
    }

    // Destroy implicit producer hash tables
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
      // Walk the chain of resized hashes from newest to oldest.
      auto hash = implicitProducerHash.load(std::memory_order_relaxed);
      while (hash != nullptr) {
        auto prev = hash->prev;
        if (prev != nullptr) {  // The last hash is part of this object and was
                                // not allocated dynamically
          for (size_t i = 0; i != hash->capacity; ++i) {
            hash->entries[i].~ImplicitProducerKVP();
          }
          hash->~ImplicitProducerHash();
          (Traits::free)(hash);
        }
        hash = prev;
      }
    }

    // Destroy global free list
    auto block = freeList.head_unsafe();
    while (block != nullptr) {
      auto next = block->freeListNext.load(std::memory_order_relaxed);
      if (block->dynamicallyAllocated) {
        // Blocks from the initial pool are released below with the pool.
        destroy(block);
      }
      block = next;
    }

    // Destroy initial free list
    destroy_array(initialBlockPool, initialBlockPoolSize);
  }

  // Disable copying and copy assignment: the producer lists, free lists and
  // token back-links cannot be meaningfully duplicated.
  ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
  ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;

  // Moving is supported, but note that it is *not* a thread-safe operation.
  // Nobody can use the queue while it's being moved, and the memory effects
  // of that move must be propagated to other threads before they can use it.
  // Note: When a queue is moved, its tokens are still valid but can only be
  // used with the destination queue (i.e. semantically they are moved along
  // with the queue itself).
  ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
      : producerListTail(
            other.producerListTail.load(std::memory_order_relaxed)),
        producerCount(other.producerCount.load(std::memory_order_relaxed)),
        initialBlockPoolIndex(
            other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
        initialBlockPool(other.initialBlockPool),
        initialBlockPoolSize(other.initialBlockPoolSize),
        freeList(std::move(other.freeList)),
        nextExplicitConsumerId(
            other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
        globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(
            std::memory_order_relaxed)) {
    // Move the other one into this, and leave the other one as an empty queue
    implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
    // Our own (empty) initial hash must exist before the hashes are swapped.
    populate_initial_implicit_producer_hash();
    swap_implicit_producer_hashes(other);

    other.producerListTail.store(nullptr, std::memory_order_relaxed);
    other.producerCount.store(0, std::memory_order_relaxed);
    other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
    other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
    explicitProducers.store(
        other.explicitProducers.load(std::memory_order_relaxed),
        std::memory_order_relaxed);
    other.explicitProducers.store(nullptr, std::memory_order_relaxed);
    implicitProducers.store(
        other.implicitProducers.load(std::memory_order_relaxed),
        std::memory_order_relaxed);
    other.implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif

    other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
    other.initialBlockPoolSize = 0;
    other.initialBlockPool = nullptr;

    // Re-point the transferred producers' parent pointers at this queue.
    reown_producers();
  }

  // Move assignment is implemented as a full state swap (not thread-safe);
  // the moved-from queue therefore receives this queue's previous state.
  inline ConcurrentQueue& operator=(ConcurrentQueue&& other)
      MOODYCAMEL_NOEXCEPT {
    return swap_internal(other);
  }

  // Swaps this queue's state with the other's. Not thread-safe.
  // Swapping two queues does not invalidate their tokens, however
  // the tokens that were created for one queue must be used with
  // only the swapped queue (i.e. the tokens are tied to the
  // queue's movable state, not the object itself).
  inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT {
    swap_internal(other);
  }

 private:
  // Shared implementation of move-assignment and swap(). All atomic accesses
  // are relaxed because this operation is not thread-safe by contract.
  ConcurrentQueue& swap_internal(ConcurrentQueue& other) {
    if (this == &other) {
      return *this;
    }

    details::swap_relaxed(producerListTail, other.producerListTail);
    details::swap_relaxed(producerCount, other.producerCount);
    details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
    std::swap(initialBlockPool, other.initialBlockPool);
    std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
    freeList.swap(other.freeList);
    details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
    details::swap_relaxed(globalExplicitConsumerOffset,
                          other.globalExplicitConsumerOffset);

    swap_implicit_producer_hashes(other);

    // After the swap, each queue's producers must point back at their new
    // owner.
    reown_producers();
    other.reown_producers();

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
    details::swap_relaxed(explicitProducers, other.explicitProducers);
    details::swap_relaxed(implicitProducers, other.implicitProducers);
#endif

    return *this;
  }

 public:
  // Enqueues a single item (by copying it), allocating more memory if needed.
  // Fails only when allocation fails, when implicit production is disabled
  // (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or when
  // Traits::MAX_SUBQUEUE_SIZE would be surpassed.
  // Thread-safe.
  inline bool enqueue(T const& item) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue<CanAlloc>(item);
    else return false;
  }

  // Enqueues a single item (moving it when possible), allocating more memory
  // if needed. Fails only when allocation fails, when implicit production is
  // disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or when
  // Traits::MAX_SUBQUEUE_SIZE would be surpassed.
  // Thread-safe.
  inline bool enqueue(T&& item) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue<CanAlloc>(std::move(item));
    else return false;
  }

  // Enqueues a single item (by copying it) using an explicit producer token.
  // Allocates memory if required. Only fails if memory allocation fails (or
  // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
  // Thread-safe. Note: token.producer is dereferenced unchecked, so the
  // token must be valid().
  inline bool enqueue(producer_token_t const& token, T const& item) {
    return inner_enqueue<CanAlloc>(token, item);
  }

  // Enqueues a single item (by moving it, if possible) using an explicit
  // producer token. Allocates memory if required. Only fails if memory
  // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would
  // be surpassed). Thread-safe. Note: token.producer is dereferenced
  // unchecked, so the token must be valid().
  inline bool enqueue(producer_token_t const& token, T&& item) {
    return inner_enqueue<CanAlloc>(token, std::move(item));
  }

  // Enqueues `count` items read from `itemFirst`, allocating more memory if
  // needed. Fails only when allocation fails, when implicit production is
  // disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0), or when
  // Traits::MAX_SUBQUEUE_SIZE would be surpassed.
  // Tip: wrap the iterator with std::make_move_iterator to move the elements
  // instead of copying them. Thread-safe.
  template <typename It>
  bool enqueue_bulk(It itemFirst, size_t count) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
    else return false;
  }

  // Enqueues several items using an explicit producer token.
  // Allocates memory if required. Only fails if memory allocation fails
  // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
  // Note: Use std::make_move_iterator if the elements should be moved
  // instead of copied.
  // Thread-safe. The token must be valid().
  template <typename It>
  bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) {
    return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
  }

  // Enqueues a single item (by copying it) without allocating memory.
  // Fails when there is not enough pre-allocated room, or when implicit
  // production is disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
  // is 0).
  // Thread-safe.
  inline bool try_enqueue(T const& item) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue<CannotAlloc>(item);
    else return false;
  }

  // Enqueues a single item (moving it when possible) without allocating
  // memory, except possibly for the thread's one-time implicit producer.
  // Fails when there is not enough room, or when implicit production is
  // disabled (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
  // Thread-safe.
  inline bool try_enqueue(T&& item) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue<CannotAlloc>(std::move(item));
    else return false;
  }

  // Enqueues a single item (by copying it) using an explicit producer token.
  // Does not allocate memory. Fails if not enough room to enqueue.
  // Thread-safe. The token must be valid().
  inline bool try_enqueue(producer_token_t const& token, T const& item) {
    return inner_enqueue<CannotAlloc>(token, item);
  }

  // Enqueues a single item (by moving it, if possible) using an explicit
  // producer token. Does not allocate memory. Fails if not enough room to
  // enqueue. Thread-safe. The token must be valid().
  inline bool try_enqueue(producer_token_t const& token, T&& item) {
    return inner_enqueue<CannotAlloc>(token, std::move(item));
  }

  // Enqueues `count` items read from `itemFirst` without allocating memory,
  // except possibly for the thread's one-time implicit producer. Fails when
  // there is not enough room, or when implicit production is disabled
  // (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
  // Tip: wrap the iterator with std::make_move_iterator to move the elements
  // instead of copying them. Thread-safe.
  template <typename It>
  bool try_enqueue_bulk(It itemFirst, size_t count) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0)
    return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
    else return false;
  }

  // Enqueues several items using an explicit producer token.
  // Does not allocate memory. Fails if not enough room to enqueue.
  // Note: Use std::make_move_iterator if the elements should be moved
  // instead of copied.
  // Thread-safe. The token must be valid().
  template <typename It>
  bool try_enqueue_bulk(producer_token_t const& token, It itemFirst,
                        size_t count) {
    return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
  }

  // Attempts to dequeue from the queue.
  // Returns false if all producer streams appeared empty at the time they
  // were checked (so, the queue is likely but not guaranteed to be empty).
  // Never allocates. Thread-safe.
  template <typename U>
  bool try_dequeue(U& item) {
    // Instead of simply trying each producer in turn (which could cause
    // needless contention on the first producer), we score them heuristically:
    // scan up to three non-empty producers and remember the fullest one.
    size_t nonEmptyCount = 0;
    ProducerBase* best = nullptr;
    size_t bestSize = 0;
    for (auto ptr = producerListTail.load(std::memory_order_acquire);
         nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
      auto size = ptr->size_approx();
      if (size > 0) {
        if (size > bestSize) {
          bestSize = size;
          best = ptr;
        }
        ++nonEmptyCount;
      }
    }

    // If there was at least one non-empty queue but it appears empty at the
    // time we try to dequeue from it, we need to make sure every queue's been
    // tried
    if (nonEmptyCount > 0) {
      // `best` is non-null here: nonEmptyCount > 0 implies it was assigned.
      if ((details::likely)(best->dequeue(item))) {
        return true;
      }
      // Fall back to scanning every other producer once.
      for (auto ptr = producerListTail.load(std::memory_order_acquire);
           ptr != nullptr; ptr = ptr->next_prod()) {
        if (ptr != best && ptr->dequeue(item)) {
          return true;
        }
      }
    }
    return false;
  }

  // Attempts to dequeue by scanning the producers strictly in list order,
  // without the contention-reducing interleaving that try_dequeue(item)
  // performs. This gives more predictable results for single-threaded
  // consumers (mostly useful for internal unit tests) but can reduce overall
  // throughput under contention.
  // Returns false if every producer stream appeared empty when it was checked
  // (so the queue is likely, but not guaranteed, to be empty).
  // Never allocates. Thread-safe.
  template <typename U>
  bool try_dequeue_non_interleaved(U& item) {
    auto prod = producerListTail.load(std::memory_order_acquire);
    while (prod != nullptr) {
      if (prod->dequeue(item)) {
        return true;
      }
      prod = prod->next_prod();
    }
    return false;
  }

  // Attempts to dequeue from the queue using an explicit consumer token.
  // Returns false if all producer streams appeared empty at the time they
  // were checked (so, the queue is likely but not guaranteed to be empty).
  // Never allocates. Thread-safe.
  template <typename U>
  bool try_dequeue(consumer_token_t& token, U& item) {
    // The idea is roughly as follows:
    // - Every EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items taken
    //   from one producer, everyone rotates (the global offset is bumped), so
    //   the highest-efficiency consumer more or less dictates the rotation
    //   speed of everyone else.
    // - If the global offset has changed, reset the consumption counter and
    //   move to the designated producer.
    // - If there are no items at the designated spot, keep moving until a
    //   producer with some items is found.
    // - If the global offset hasn't changed but this consumer has run out of
    //   items, move over from the current position until a producer with
    //   something in it is found.

    if (token.desiredProducer == nullptr ||
        token.lastKnownGlobalOffset !=
            globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
      if (!update_current_producer_after_rotation(token)) {
        return false;
      }
    }

    // If there was at least one non-empty queue but it appears empty at the
    // time we try to dequeue from it, we need to make sure every queue's been
    // tried
    if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
      // Quota reached: trigger a global rotation for all token consumers.
      if (++token.itemsConsumedFromCurrent ==
          EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
        globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
      }
      return true;
    }

    // Walk the rest of the (circular) producer list once, starting just past
    // the current producer and wrapping at the tail.
    auto tail = producerListTail.load(std::memory_order_acquire);
    auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
    if (ptr == nullptr) {
      ptr = tail;
    }
    while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
      if (ptr->dequeue(item)) {
        token.currentProducer = ptr;
        token.itemsConsumedFromCurrent = 1;
        return true;
      }
      ptr = ptr->next_prod();
      if (ptr == nullptr) {
        ptr = tail;
      }
    }
    return false;
  }

  // Attempts to dequeue up to `max` elements, writing them through
  // `itemFirst`, and returns how many were actually dequeued. A result of 0
  // means every producer stream appeared empty when checked (the queue is
  // likely, but not guaranteed, to be empty).
  // Never allocates. Thread-safe.
  template <typename It>
  size_t try_dequeue_bulk(It itemFirst, size_t max) {
    size_t dequeued = 0;
    auto prod = producerListTail.load(std::memory_order_acquire);
    while (prod != nullptr && dequeued < max) {
      dequeued += prod->dequeue_bulk(itemFirst, max - dequeued);
      prod = prod->next_prod();
    }
    return dequeued;
  }

  // Attempts to dequeue several elements from the queue using an explicit
  // consumer token. Returns the number of items actually dequeued. Returns 0 if
  // all producer streams appeared empty at the time they were checked (so, the
  // queue is likely but not guaranteed to be empty). Never allocates.
  // Thread-safe.
  template <typename It>
  size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) {
    // Same rotation scheme as try_dequeue(token, item); see that method.
    if (token.desiredProducer == nullptr ||
        token.lastKnownGlobalOffset !=
            globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
      if (!update_current_producer_after_rotation(token)) {
        return 0;
      }
    }

    size_t count = static_cast<ProducerBase*>(token.currentProducer)
                       ->dequeue_bulk(itemFirst, max);
    if (count == max) {
      // Fully satisfied from the current producer; rotate if the quota has
      // been met or exceeded.
      if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >=
          EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
        globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
      }
      return max;
    }
    token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
    max -= count;

    // Walk the rest of the (circular) producer list once to fill the balance,
    // wrapping at the tail.
    auto tail = producerListTail.load(std::memory_order_acquire);
    auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
    if (ptr == nullptr) {
      ptr = tail;
    }
    while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
      auto dequeued = ptr->dequeue_bulk(itemFirst, max);
      count += dequeued;
      if (dequeued != 0) {
        // Make this producer the token's new current position.
        token.currentProducer = ptr;
        token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
      }
      if (dequeued == max) {
        break;
      }
      max -= dequeued;
      ptr = ptr->next_prod();
      if (ptr == nullptr) {
        ptr = tail;
      }
    }
    return count;
  }

  // Attempts to dequeue from a specific producer's inner queue.
  // If you happen to know which producer you want to dequeue from, this
  // is significantly faster than using the general-case try_dequeue methods.
  // Returns false if the producer's queue appeared empty at the time it
  // was checked (so, the queue is likely but not guaranteed to be empty).
  // Never allocates. Thread-safe. The token must be valid().
  template <typename U>
  inline bool try_dequeue_from_producer(producer_token_t const& producer,
                                        U& item) {
    return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
  }

  // Attempts to dequeue several elements from a specific producer's inner
  // queue. Returns the number of items actually dequeued. If you happen to know
  // which producer you want to dequeue from, this is significantly faster than
  // using the general-case try_dequeue methods. Returns 0 if the producer's
  // queue appeared empty at the time it was checked (so, the queue is likely
  // but not guaranteed to be empty). Never allocates. Thread-safe.
  // The token must be valid().
  template <typename It>
  inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer,
                                               It itemFirst, size_t max) {
    return static_cast<ExplicitProducer*>(producer.producer)
        ->dequeue_bulk(itemFirst, max);
  }

  // Returns an estimate of the total number of elements currently in the
  // queue: the sum of every producer's approximate size. The estimate is only
  // exact once the queue has completely stabilized (all enqueue/dequeue
  // operations have finished and their memory effects are visible to the
  // calling thread, with no new operations starting). Thread-safe.
  size_t size_approx() const {
    size_t total = 0;
    auto prod = producerListTail.load(std::memory_order_acquire);
    while (prod != nullptr) {
      total += prod->size_approx();
      prod = prod->next_prod();
    }
    return total;
  }

  // Returns true if the underlying atomic variables used by
  // the queue are lock-free (they should be on most platforms).
  // Thread-safe.
  static constexpr bool is_lock_free() {
    // A value of 2 corresponds to ATOMIC_xxx_LOCK_FREE's "always lock-free"
    // (1 = sometimes, 0 = never); every atomic the queue relies on must be in
    // the always-lock-free category.
    return details::static_is_lock_free<bool>::value == 2 &&
           details::static_is_lock_free<size_t>::value == 2 &&
           details::static_is_lock_free<std::uint32_t>::value == 2 &&
           details::static_is_lock_free<index_t>::value == 2 &&
           details::static_is_lock_free<void*>::value == 2 &&
           details::static_is_lock_free<typename details::thread_id_converter<
               details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
  }

 private:
  // Tokens and producers need access to the queue's internals.
  friend struct ProducerToken;
  friend struct ConsumerToken;
  struct ExplicitProducer;
  friend struct ExplicitProducer;
  struct ImplicitProducer;
  friend struct ImplicitProducer;
  friend class ConcurrentQueueTests;

  // Whether an enqueue operation may allocate a new block, or must fail
  // instead when it runs out of pre-allocated room.
  enum AllocationMode { CanAlloc, CannotAlloc };

  ///////////////////////////////
  // Queue methods
  ///////////////////////////////

  // Enqueues via the explicit producer bound to `token`. The fully-qualified
  // call binds directly to ExplicitProducer::enqueue (qualified calls are
  // never dispatched virtually).
  template <AllocationMode canAlloc, typename U>
  inline bool inner_enqueue(producer_token_t const& token, U&& element) {
    return static_cast<ExplicitProducer*>(token.producer)
        ->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
            std::forward<U>(element));
  }

  // Enqueues onto the calling thread's implicit producer, creating one on
  // first use. Returns false when the implicit producer could not be
  // obtained.
  template <AllocationMode canAlloc, typename U>
  inline bool inner_enqueue(U&& element) {
    auto producer = get_or_add_implicit_producer();
    if (producer == nullptr) {
      return false;
    }
    // Fully-qualified call: bind directly to ImplicitProducer::enqueue.
    return producer->ConcurrentQueue::ImplicitProducer::template enqueue<
        canAlloc>(std::forward<U>(element));
  }

  // Bulk-enqueues via the explicit producer bound to `token`; see
  // inner_enqueue(token, element) above for the qualified-call rationale.
  template <AllocationMode canAlloc, typename It>
  inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst,
                                 size_t count) {
    return static_cast<ExplicitProducer*>(token.producer)
        ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(
            itemFirst, count);
  }

  // Bulk-enqueues onto the calling thread's implicit producer, creating one
  // on first use. Returns false when the implicit producer could not be
  // obtained.
  template <AllocationMode canAlloc, typename It>
  inline bool inner_enqueue_bulk(It itemFirst, size_t count) {
    auto producer = get_or_add_implicit_producer();
    if (producer == nullptr) {
      return false;
    }
    return producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<
        canAlloc>(itemFirst, count);
  }

  // Re-derives the token's current producer after the global rotation offset
  // changed (or on the token's first dequeue). Returns false when there are
  // no producers at all to dequeue from.
  inline bool update_current_producer_after_rotation(consumer_token_t& token) {
    // Ah, there's been a rotation, figure out where we should be!
    auto tail = producerListTail.load(std::memory_order_acquire);
    if (token.desiredProducer == nullptr && tail == nullptr) {
      return false;
    }
    auto prodCount = producerCount.load(std::memory_order_relaxed);
    auto globalOffset =
        globalExplicitConsumerOffset.load(std::memory_order_relaxed);
    if ((details::unlikely)(token.desiredProducer == nullptr)) {
      // Aha, first time we're dequeueing anything.
      // Figure out our local position
      // Note: offset is from start, not end, but we're traversing from end --
      // subtract from count first
      std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
      token.desiredProducer = tail;
      // Walk `offset` steps through the list, wrapping at the end.
      for (std::uint32_t i = 0; i != offset; ++i) {
        token.desiredProducer =
            static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
        if (token.desiredProducer == nullptr) {
          token.desiredProducer = tail;
        }
      }
    }

    // Advance by however many rotations occurred since the last look,
    // reduced modulo the producer count.
    std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
    if (delta >= prodCount) {
      delta = delta % prodCount;
    }
    for (std::uint32_t i = 0; i != delta; ++i) {
      token.desiredProducer =
          static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
      if (token.desiredProducer == nullptr) {
        token.desiredProducer = tail;
      }
    }

    token.lastKnownGlobalOffset = globalOffset;
    token.currentProducer = token.desiredProducer;
    token.itemsConsumedFromCurrent = 0;
    return true;
  }

  ///////////////////////////
  // Free list
  ///////////////////////////

  // Intrusive header for nodes managed by FreeList<N>: a reference count
  // (which also carries the should-be-on-freelist flag bit) and the next
  // pointer of the singly-linked free list.
  template <typename N>
  struct FreeListNode {
    FreeListNode() : freeListRefs(0), freeListNext(nullptr) {}

    std::atomic<std::uint32_t> freeListRefs;
    std::atomic<N*> freeListNext;
  };

  // A simple CAS-based lock-free free list. Not the fastest thing in the world
  // under heavy contention, but simple and correct (assuming nodes are never
  // freed until after the free list is destroyed), and fairly speedy under low
  // contention.
  template <typename N>  // N must inherit FreeListNode or have the same fields
                         // (and initialization of them)
  struct FreeList {
    FreeList() : freeListHead(nullptr) {}
    // Move construction steals the head; the source is left empty.
    // Not safe to call concurrently with other free-list operations.
    FreeList(FreeList&& other)
        : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) {
      other.freeListHead.store(nullptr, std::memory_order_relaxed);
    }
    // Exchanges heads with `other` (relaxed; not thread-safe by contract).
    void swap(FreeList& other) {
      details::swap_relaxed(freeListHead, other.freeListHead);
    }

    // Copying a lock-free list would break the intrusive node ownership.
    FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
    FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;

    // Makes `node` eligible for the free list. The node is only physically
    // linked in once its reference count drops to zero; otherwise the
    // SHOULD_BE_ON_FREELIST flag defers that work to the last referencer.
    inline void add(N* node) {
#ifdef MCDBGQ_NOLOCKFREE_FREELIST
      debug::DebugLock lock(mutex);
#endif
      // We know that the should-be-on-freelist bit is 0 at this point, so it's
      // safe to set it using a fetch_add
      if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST,
                                       std::memory_order_acq_rel) == 0) {
        // Oh look! We were the last ones referencing this node, and we know
        // we want to add it to the free list, so let's do it!
        add_knowing_refcount_is_zero(node);
      }
    }

    inline N* try_get() {
#ifdef MCDBGQ_NOLOCKFREE_FREELIST
      debug::DebugLock lock(mutex);
#endif
      auto head = freeListHead.load(std::memory_order_acquire);
      while (head != nullptr) {
        auto prevHead = head;
        auto refs = head->freeListRefs.load(std::memory_order_relaxed);
        if ((refs & REFS_MASK) == 0 ||
            !head->freeListRefs.compare_exchange_strong(
                refs, refs + 1, std::memory_order_acquire,
                std::memory_order_relaxed)) {
          head = freeListHead.load(std::memory_order_acquire);
          continue;
        }

        // Good, reference count has been incremented (it wasn't at zero), which
        // means we can read the next and not worry about it changing between
        // now and the time we do the CAS
        auto next = head->freeListNext.load(std::memory_order_relaxed);
        if (freeListHead.compare_exchange_strong(head, next,
                                                 std::memory_order_acquire,
                                                 std::memory_order_relaxed)) {
          // Yay, got the node. This means it was on the list, which means
          // shouldBeOnFreeList must be false no matter the refcount (because
          // nobody else knows it's been taken off yet, it can't have been put
          // back on).
          assert((head->freeListRefs.load(std::memory_order_relaxed) &
                  SHOULD_BE_ON_FREELIST) == 0);

          // Decrease refcount twice, once for our ref, and once for the list's
          // ref
          head->freeListRefs.fetch_sub(2, std::memory_order_release);
          return head;
        }

        // OK, the head must have changed on us, but we still need to decrease
        // the refcount we increased. Note that we don't need to release any
        // memory effects, but we do need to ensure that the reference count
        // decrement happens-after the CAS on the head.
        refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
        if (refs == SHOULD_BE_ON_FREELIST + 1) {
          add_knowing_refcount_is_zero(prevHead);
        }
      }

      return nullptr;
    }

    // Useful for traversing the list when there's no contention (e.g. to
    // destroy remaining nodes)
    N* head_unsafe() const {
      return freeListHead.load(std::memory_order_relaxed);
    }

   private:
    inline void add_knowing_refcount_is_zero(N* node) {
      // Since the refcount is zero, and nobody can increase it once it's zero
      // (except us, and we run only one copy of this method per node at a time,
      // i.e. the single thread case), then we know we can safely change the
      // next pointer of the node; however, once the refcount is back above
      // zero, then other threads could increase it (happens under heavy
      // contention, when the refcount goes to zero in between a load and a
      // refcount increment of a node in try_get, then back up to something
      // non-zero, then the refcount increment is done by the other thread) --
      // so, if the CAS to add the node to the actual list fails, decrease the
      // refcount and leave the add operation to the next thread who puts the
      // refcount back at zero (which could be us, hence the loop).
      auto head = freeListHead.load(std::memory_order_relaxed);
      while (true) {
        node->freeListNext.store(head, std::memory_order_relaxed);
        node->freeListRefs.store(1, std::memory_order_release);
        if (!freeListHead.compare_exchange_strong(head, node,
                                                  std::memory_order_release,
                                                  std::memory_order_relaxed)) {
          // Hmm, the add failed, but we can only try again when the refcount
          // goes back to zero
          if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1,
                                           std::memory_order_release) == 1) {
            continue;
          }
        }
        return;
      }
    }

   private:
    // Implemented like a stack, but where node order doesn't matter (nodes are
    // inserted out of order under contention)
    std::atomic<N*> freeListHead;

    static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
    static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;

#ifdef MCDBGQ_NOLOCKFREE_FREELIST
    debug::DebugMutex mutex;
#endif
  };

  ///////////////////////////
  // Block
  ///////////////////////////

  // Selects which emptiness-tracking strategy Block's methods use: the
  // per-slot empty flags are only ever used for explicit_context (and only
  // when the block is small enough); implicit_context always uses the
  // dequeue counter.
  enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };

  // Fixed-capacity storage for QUEUE_BLOCK_SIZE elements of T, chained into a
  // circular list via `next`. Emptiness is tracked either with one flag per
  // slot (explicit context with small blocks) or with a single dequeue
  // counter (all other cases); the strategy is chosen at compile time in each
  // method below.
  struct Block {
    Block()
        : next(nullptr),
          elementsCompletelyDequeued(0),
          freeListRefs(0),
          freeListNext(nullptr),
          dynamicallyAllocated(true) {
#ifdef MCDBGQ_TRACKMEM
      owner = nullptr;
#endif
    }

    // Returns true iff every element in this block has been completely
    // dequeued. On the true path an acquire fence pairs with the release
    // stores done by set_empty/set_many_empty.
    template <InnerQueueContext context>
    inline bool is_empty() const {
      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
                              QUEUE_BLOCK_SIZE <=
                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
        // Check flags
        for (size_t i = 0; i < QUEUE_BLOCK_SIZE; ++i) {
          if (!emptyFlags[i].load(std::memory_order_relaxed)) {
            return false;
          }
        }

        // Aha, empty; make sure we have all other memory effects that happened
        // before the empty flags were set
        std::atomic_thread_fence(std::memory_order_acquire);
        return true;
      }
      else {
        // Check counter
        if (elementsCompletelyDequeued.load(std::memory_order_relaxed) ==
            QUEUE_BLOCK_SIZE) {
          std::atomic_thread_fence(std::memory_order_acquire);
          return true;
        }
        assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <=
               QUEUE_BLOCK_SIZE);
        return false;
      }
    }

    // Returns true if the block is now empty (does not apply in explicit
    // context)
    template <InnerQueueContext context>
    inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) {
      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
                              QUEUE_BLOCK_SIZE <=
                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
        // Set flag. Note that the flags are stored in reverse order of the
        // slot index they track (QUEUE_BLOCK_SIZE - 1 - slot).
        assert(!emptyFlags[QUEUE_BLOCK_SIZE - 1 -
                           static_cast<size_t>(
                               i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))]
                    .load(std::memory_order_relaxed));
        emptyFlags[QUEUE_BLOCK_SIZE - 1 -
                   static_cast<size_t>(
                       i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))]
            .store(true, std::memory_order_release);
        return false;
      }
      else {
        // Increment counter
        auto prevVal =
            elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
        assert(prevVal < QUEUE_BLOCK_SIZE);
        return prevVal == QUEUE_BLOCK_SIZE - 1;
      }
    }

    // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping
    // and count > 0). Returns true if the block is now empty (does not apply in
    // explicit context).
    template <InnerQueueContext context>
    inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i,
                               size_t count) {
      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
                              QUEUE_BLOCK_SIZE <=
                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
        // Set flags. A single release fence up front lets the individual
        // stores below be relaxed.
        std::atomic_thread_fence(std::memory_order_release);
        // Convert `i` to the (reversed) flag index of the first item in the
        // run; see set_empty for the reversal.
        i = QUEUE_BLOCK_SIZE - 1 -
            static_cast<size_t>(i &
                                static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
            count + 1;
        for (size_t j = 0; j != count; ++j) {
          assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
          emptyFlags[i + j].store(true, std::memory_order_relaxed);
        }
        return false;
      }
      else {
        // Increment counter
        auto prevVal = elementsCompletelyDequeued.fetch_add(
            count, std::memory_order_release);
        assert(prevVal + count <= QUEUE_BLOCK_SIZE);
        return prevVal + count == QUEUE_BLOCK_SIZE;
      }
    }

    // Marks the whole block empty (relaxed stores only; the caller is
    // responsible for any required synchronization).
    template <InnerQueueContext context>
    inline void set_all_empty() {
      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
                              QUEUE_BLOCK_SIZE <=
                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
        // Set all flags
        for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) {
          emptyFlags[i].store(true, std::memory_order_relaxed);
        }
      }
      else {
        // Reset counter
        elementsCompletelyDequeued.store(QUEUE_BLOCK_SIZE,
                                         std::memory_order_relaxed);
      }
    }

    // Marks the whole block non-empty, ready to be (re)filled (relaxed stores
    // only; the caller is responsible for any required synchronization).
    template <InnerQueueContext context>
    inline void reset_empty() {
      MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
                              QUEUE_BLOCK_SIZE <=
                                  EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
        // Reset flags
        for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) {
          emptyFlags[i].store(false, std::memory_order_relaxed);
        }
      }
      else {
        // Reset counter
        elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
      }
    }

    // Maps a global queue index to a pointer at this block's corresponding
    // element slot (index modulo QUEUE_BLOCK_SIZE; the slot may or may not
    // contain a live T).
    inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT {
      return static_cast<T*>(static_cast<void*>(elements)) +
             static_cast<size_t>(idx &
                                 static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
    }
    inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT {
      return static_cast<T const*>(static_cast<void const*>(elements)) +
             static_cast<size_t>(idx &
                                 static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
    }

   private:
    static_assert(std::alignment_of<T>::value <= sizeof(T),
                  "The queue does not support types with an alignment greater "
                  "than their size at this time");
    // Raw, suitably-aligned storage; elements are constructed and destroyed
    // in place via placement-new / explicit destructor calls.
    MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * QUEUE_BLOCK_SIZE], T)
    elements;

   public:
    Block* next;
    std::atomic<size_t> elementsCompletelyDequeued;
    // One flag per slot when the flag strategy is in use; otherwise a single
    // dummy entry so the array is never zero-sized.
    std::atomic<bool>
        emptyFlags[QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD
                       ? QUEUE_BLOCK_SIZE
                       : 1];

   public:
    // Fields used by FreeList<Block> (see FreeListNode).
    std::atomic<std::uint32_t> freeListRefs;
    std::atomic<Block*> freeListNext;
    bool dynamicallyAllocated;  // Perhaps a better name for this would be
                                // 'isNotPartOfInitialBlockPool'

#ifdef MCDBGQ_TRACKMEM
    void* owner;
#endif
  };
  static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value,
                "Internal error: Blocks must be at least as aligned as the "
                "type they are wrapping");

#ifdef MCDBGQ_TRACKMEM
 public:
  struct MemStats;

 private:
#endif

  ///////////////////////////
  // Producer base
  ///////////////////////////

  struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase {
    ProducerBase(ConcurrentQueue* parent_, bool isExplicit_)
        : tailIndex(0),
          headIndex(0),
          dequeueOptimisticCount(0),
          dequeueOvercommit(0),
          tailBlock(nullptr),
          isExplicit(isExplicit_),
          parent(parent_) {}

    virtual ~ProducerBase() {}

    template <typename U>
    inline bool dequeue(U& element) {
      if (isExplicit) {
        return static_cast<ExplicitProducer*>(this)->dequeue(element);
      }
      else {
        return static_cast<ImplicitProducer*>(this)->dequeue(element);
      }
    }

    template <typename It>
    inline size_t dequeue_bulk(It& itemFirst, size_t max) {
      if (isExplicit) {
        return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst,
                                                                  max);
      }
      else {
        return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst,
                                                                  max);
      }
    }

    inline ProducerBase* next_prod() const {
      return static_cast<ProducerBase*>(next);
    }

    inline size_t size_approx() const {
      auto tail = tailIndex.load(std::memory_order_relaxed);
      auto head = headIndex.load(std::memory_order_relaxed);
      return details::circular_less_than(head, tail)
                 ? static_cast<size_t>(tail - head)
                 : 0;
    }

    inline index_t getTail() const {
      return tailIndex.load(std::memory_order_relaxed);
    }

   protected:
    std::atomic<index_t> tailIndex;  // Where to enqueue to next
    std::atomic<index_t> headIndex;  // Where to dequeue from next

    std::atomic<index_t> dequeueOptimisticCount;
    std::atomic<index_t> dequeueOvercommit;

    Block* tailBlock;

   public:
    bool isExplicit;
    ConcurrentQueue* parent;

   protected:
#ifdef MCDBGQ_TRACKMEM
    friend struct MemStats;
#endif
  };

  ///////////////////////////
  // Explicit queue
  ///////////////////////////

  struct ExplicitProducer : public ProducerBase {
    // Constructs an explicit producer owned by `parent_`. The block index
    // starts at half of EXPLICIT_INITIAL_INDEX_SIZE (new_block_index doubles
    // it), enlarged further if the parent's initial block pool warrants a
    // bigger index.
    explicit ExplicitProducer(ConcurrentQueue* parent_)
        : ProducerBase(parent_, true),
          blockIndex(nullptr),
          pr_blockIndexSlotsUsed(0),
          pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
          pr_blockIndexFront(0),
          pr_blockIndexEntries(nullptr),
          pr_blockIndexRaw(nullptr) {
      const size_t poolBasedIndexSize =
          details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
      if (pr_blockIndexSize < poolBasedIndexSize) {
        pr_blockIndexSize = poolBasedIndexSize;
      }

      // Creates an index with double the current entry count, i.e. at least
      // EXPLICIT_INITIAL_INDEX_SIZE entries.
      new_block_index(0);
    }

    // Destroys any elements that were never dequeued, returns every owned
    // block to the parent's free list, and frees the chain of block index
    // allocations.
    ~ExplicitProducer() {
      // Destruct any elements not yet dequeued.
      // Since we're in the destructor, we can assume all elements
      // are either completely dequeued or completely not (no halfways).
      if (this->tailBlock !=
          nullptr) {  // Note this means there must be a block index too
        // First find the block that's partially dequeued, if any
        Block* halfDequeuedBlock = nullptr;
        if ((this->headIndex.load(std::memory_order_relaxed) &
             static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0) {
          // The head's not on a block boundary, meaning a block somewhere is
          // partially dequeued (or the head block is the tail block and was
          // fully dequeued, but the head/tail are still not on a boundary)
          size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) &
                     (pr_blockIndexSize - 1);
          // Scan forward from the oldest index entry until we reach the entry
          // whose block contains the head index.
          while (details::circular_less_than<index_t>(
              pr_blockIndexEntries[i].base + QUEUE_BLOCK_SIZE,
              this->headIndex.load(std::memory_order_relaxed))) {
            i = (i + 1) & (pr_blockIndexSize - 1);
          }
          assert(details::circular_less_than<index_t>(
              pr_blockIndexEntries[i].base,
              this->headIndex.load(std::memory_order_relaxed)));
          halfDequeuedBlock = pr_blockIndexEntries[i].block;
        }

        // Start at the head block (note the first line in the loop gives us the
        // head from the tail on the first iteration)
        auto block = this->tailBlock;
        do {
          block = block->next;
          if (block->ConcurrentQueue::Block::template is_empty<
                  explicit_context>()) {
            continue;
          }

          size_t i = 0;  // Offset into block
          if (block == halfDequeuedBlock) {
            // Skip the already-dequeued prefix of the partially-dequeued block
            i = static_cast<size_t>(
                this->headIndex.load(std::memory_order_relaxed) &
                static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
          }

          // Walk through all the items in the block; if this is the tail block,
          // we need to stop when we reach the tail index
          auto lastValidIndex =
              (this->tailIndex.load(std::memory_order_relaxed) &
               static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0
                  ? QUEUE_BLOCK_SIZE
                  : static_cast<size_t>(
                        this->tailIndex.load(std::memory_order_relaxed) &
                        static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
          while (i != QUEUE_BLOCK_SIZE &&
                 (block != this->tailBlock || i != lastValidIndex)) {
            (*block)[i++]->~T();
          }
        } while (block != this->tailBlock);
      }

      // Destroy all blocks that we own
      if (this->tailBlock != nullptr) {
        auto block = this->tailBlock;
        do {
          auto nextBlock = block->next;
          this->parent->add_block_to_free_list(block);
          block = nextBlock;
        } while (block != this->tailBlock);
      }

      // Destroy the block indices (each header links to the previous index
      // allocation via `prev`)
      auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
      while (header != nullptr) {
        auto prev = static_cast<BlockIndexHeader*>(header->prev);
        header->~BlockIndexHeader();
        (Traits::free)(header);
        header = prev;
      }
    }

    // Enqueues one element at this producer's tail. When the tail crosses a
    // block boundary, a block is reused from ahead in the circular list if
    // empty, or a new one is requisitioned (subject to allocMode). Returns
    // false if the sub-queue is at capacity or allocation fails/is
    // disallowed; on success the new tail index is published with a release
    // store. Only the owning producer thread may call this.
    template <AllocationMode allocMode, typename U>
    inline bool enqueue(U&& element) {
      index_t currentTailIndex =
          this->tailIndex.load(std::memory_order_relaxed);
      index_t newTailIndex = 1 + currentTailIndex;
      if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) ==
          0) {
        // We reached the end of a block, start a new one
        auto startBlock = this->tailBlock;
        auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
        if (this->tailBlock != nullptr &&
            this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
                explicit_context>()) {
          // We can re-use the block ahead of us, it's empty!
          this->tailBlock = this->tailBlock->next;
          this->tailBlock->ConcurrentQueue::Block::template reset_empty<
              explicit_context>();

          // We'll put the block on the block index (guaranteed to be room since
          // we're conceptually removing the last block from it first -- except
          // instead of removing then adding, we can just overwrite). Note that
          // there must be a valid block index here, since even if allocation
          // failed in the ctor, it would have been re-attempted when adding the
          // first block to the queue; since there is such a block, a block
          // index must have been successfully allocated.
        }
        else {
          // Whatever head value we see here is >= the last value we saw here
          // (relatively), and <= its current value. Since we have the most
          // recent tail, the head must be
          // <= to it.
          auto head = this->headIndex.load(std::memory_order_relaxed);
          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
          if (!details::circular_less_than<index_t>(
                  head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
               (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
                                              currentTailIndex - head))) {
            // We can't enqueue in another block because there's not enough
            // leeway -- the tail could surpass the head by the time the block
            // fills up! (Or we'll exceed the size limit, if the second part of
            // the condition was true.)
            return false;
          }
          // We're going to need a new block; check that the block index has
          // room
          if (pr_blockIndexRaw == nullptr ||
              pr_blockIndexSlotsUsed == pr_blockIndexSize) {
            // Hmm, the circular block index is already full -- we'll need
            // to allocate a new index. Note pr_blockIndexRaw can only be
            // nullptr if the initial allocation failed in the constructor.

            MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; }
            else if (!new_block_index(pr_blockIndexSlotsUsed)) {
              return false;
            }
          }

          // Insert a new block in the circular linked list
          auto newBlock =
              this->parent
                  ->ConcurrentQueue::template requisition_block<allocMode>();
          if (newBlock == nullptr) {
            return false;
          }
#ifdef MCDBGQ_TRACKMEM
          newBlock->owner = this;
#endif
          newBlock->ConcurrentQueue::Block::template reset_empty<
              explicit_context>();
          if (this->tailBlock == nullptr) {
            newBlock->next = newBlock;
          }
          else {
            newBlock->next = this->tailBlock->next;
            this->tailBlock->next = newBlock;
          }
          this->tailBlock = newBlock;
          ++pr_blockIndexSlotsUsed;
        }

        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
            T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
          // The constructor may throw. We want the element not to appear in the
          // queue in that case (without corrupting the queue):
          MOODYCAMEL_TRY {
            new ((*this->tailBlock)[currentTailIndex])
                T(std::forward<U>(element));
          }
          MOODYCAMEL_CATCH(...) {
            // Revert change to the current block, but leave the new block
            // available for next time
            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
            this->tailBlock =
                startBlock == nullptr ? this->tailBlock : startBlock;
            MOODYCAMEL_RETHROW;
          }
        }
        else {
          (void)startBlock;
          (void)originalBlockIndexSlotsUsed;
        }

        // Add block to block index
        auto& entry = blockIndex.load(std::memory_order_relaxed)
                          ->entries[pr_blockIndexFront];
        entry.base = currentTailIndex;
        entry.block = this->tailBlock;
        blockIndex.load(std::memory_order_relaxed)
            ->front.store(pr_blockIndexFront, std::memory_order_release);
        pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);

        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
            T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
          // Element already constructed above (inside the try); just publish
          // the tail and we're done.
          this->tailIndex.store(newTailIndex, std::memory_order_release);
          return true;
        }
      }

      // Enqueue
      new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));

      this->tailIndex.store(newTailIndex, std::memory_order_release);
      return true;
    }

    // Attempts to dequeue one element from this producer's sub-queue into
    // `element`. Returns false if nothing was available. Safe to call from
    // any consumer thread; uses the optimistic-count/overcommit protocol to
    // claim a slot without locking.
    template <typename U>
    bool dequeue(U& element) {
      auto tail = this->tailIndex.load(std::memory_order_relaxed);
      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
      if (details::circular_less_than<index_t>(
              this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
                  overcommit,
              tail)) {
        // Might be something to dequeue, let's give it a try

        // Note that this if is purely for performance purposes in the common
        // case when the queue is empty and the values are eventually consistent
        // -- we may enter here spuriously.

        // Note that whatever the values of overcommit and tail are, they are
        // not going to change (unless we change them) and must be the same
        // value at this point (inside the if) as when the if condition was
        // evaluated.

        // We insert an acquire fence here to synchronize-with the release upon
        // incrementing dequeueOvercommit below. This ensures that whatever the
        // value we got loaded into overcommit, the load of
        // dequeueOptimisticCount in the fetch_add below will result in a value
        // at least as recent as that (and therefore at least as large). Note
        // that I believe a compiler (signal) fence here would be sufficient due
        // to the nature of fetch_add (all read-modify-write operations are
        // guaranteed to work on the latest value in the modification order),
        // but unfortunately that can't be shown to be correct using only the
        // C++11 standard. See
        // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
        std::atomic_thread_fence(std::memory_order_acquire);

        // Increment optimistic counter, then check if it went over the boundary
        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
            1, std::memory_order_relaxed);

        // Note that since dequeueOvercommit must be <= dequeueOptimisticCount
        // (because dequeueOvercommit is only ever incremented after
        // dequeueOptimisticCount -- this is enforced in the `else` block
        // below), and since we now have a version of dequeueOptimisticCount
        // that is at least as recent as overcommit (due to the release upon
        // incrementing dequeueOvercommit and the acquire above that
        // synchronizes with it), overcommit <= myDequeueCount. However, we
        // can't assert this since both dequeueOptimisticCount and
        // dequeueOvercommit may (independently) overflow; in such a case,
        // though, the logic still holds since the difference between the two is
        // maintained.

        // Note that we reload tail here in case it changed; it will be the same
        // value as before or greater, since this load is sequenced after
        // (happens after) the earlier load above. This is supported by
        // read-read coherency (as defined in the standard), explained here:
        // http://en.cppreference.com/w/cpp/atomic/memory_order
        tail = this->tailIndex.load(std::memory_order_acquire);
        if ((details::likely)(details::circular_less_than<index_t>(
                myDequeueCount - overcommit, tail))) {
          // Guaranteed to be at least one element to dequeue!

          // Get the index. Note that since there's guaranteed to be at least
          // one element, this will never exceed tail. We need to do an
          // acquire-release fence here since it's possible that whatever
          // condition got us to this point was for an earlier enqueued element
          // (that we already see the memory effects for), but that by the time
          // we increment somebody else has incremented it, and we need to see
          // the memory effects for *that* element, which in such a case is
          // necessarily visible on the thread that incremented it in the first
          // place with the more current condition (they must have acquired a
          // tail that is at least as recent).
          auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);

          // Determine which block the element is in

          auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
          auto localBlockIndexHead =
              localBlockIndex->front.load(std::memory_order_acquire);

          // We need to be careful here about subtracting and dividing because
          // of index wrap-around. When an index wraps, we need to preserve the
          // sign of the offset when dividing it by the block size (in order to
          // get a correct signed block count offset in all cases):
          auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
          auto blockBaseIndex =
              index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
          auto offset = static_cast<size_t>(
              static_cast<typename std::make_signed<index_t>::type>(
                  blockBaseIndex - headBase) /
              static_cast<typename std::make_signed<index_t>::type>(
                  QUEUE_BLOCK_SIZE));
          auto block = localBlockIndex
                           ->entries[(localBlockIndexHead + offset) &
                                     (localBlockIndex->size - 1)]
                           .block;

          // Dequeue
          auto& el = *((*block)[index]);
          if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
            // Make sure the element is still fully dequeued and destroyed even
            // if the assignment throws
            struct Guard {
              Block* block;
              index_t index;

              ~Guard() {
                (*block)[index]->~T();
                block->ConcurrentQueue::Block::template set_empty<
                    explicit_context>(index);
              }
            } guard = {block, index};

            element = std::move(el);  // NOLINT
          }
          else {
            element = std::move(el);  // NOLINT
            el.~T();                  // NOLINT
            block->ConcurrentQueue::Block::template set_empty<explicit_context>(
                index);
          }

          return true;
        }
        else {
          // Wasn't anything to dequeue after all; make the effective dequeue
          // count eventually consistent
          this->dequeueOvercommit.fetch_add(
              1, std::memory_order_release);  // Release so that the fetch_add
                                              // on dequeueOptimisticCount is
                                              // guaranteed to happen before
                                              // this write
        }
      }

      return false;
    }

    template <AllocationMode allocMode, typename It>
    bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) {
      // First, we need to make sure we have enough room to enqueue all of the
      // elements; this means pre-allocating blocks and putting them in the
      // block index (but only if all the allocations succeeded).
      index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
      auto startBlock = this->tailBlock;
      auto originalBlockIndexFront = pr_blockIndexFront;
      auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;

      Block* firstAllocatedBlock = nullptr;

      // Figure out how many blocks we'll need to allocate, and do so
      size_t blockBaseDiff =
          ((startTailIndex + count - 1) &
           ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
          ((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
      index_t currentTailIndex =
          (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
      if (blockBaseDiff > 0) {
        // Allocate as many blocks as possible from ahead
        while (blockBaseDiff > 0 && this->tailBlock != nullptr &&
               this->tailBlock->next != firstAllocatedBlock &&
               this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
                   explicit_context>()) {
          blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
          currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);

          this->tailBlock = this->tailBlock->next;
          firstAllocatedBlock = firstAllocatedBlock == nullptr
                                    ? this->tailBlock
                                    : firstAllocatedBlock;

          auto& entry = blockIndex.load(std::memory_order_relaxed)
                            ->entries[pr_blockIndexFront];
          entry.base = currentTailIndex;
          entry.block = this->tailBlock;
          pr_blockIndexFront =
              (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
        }

        // Now allocate as many blocks as necessary from the block pool
        while (blockBaseDiff > 0) {
          blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
          currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);

          auto head = this->headIndex.load(std::memory_order_relaxed);
          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
          bool full =
              !details::circular_less_than<index_t>(
                  head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
               (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
                                              currentTailIndex - head));
          if (pr_blockIndexRaw == nullptr ||
              pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
            MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
              // Failed to allocate, undo changes (but keep injected blocks)
              pr_blockIndexFront = originalBlockIndexFront;
              pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
              this->tailBlock =
                  startBlock == nullptr ? firstAllocatedBlock : startBlock;
              return false;
            }
            else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
              // Failed to allocate, undo changes (but keep injected blocks)
              pr_blockIndexFront = originalBlockIndexFront;
              pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
              this->tailBlock =
                  startBlock == nullptr ? firstAllocatedBlock : startBlock;
              return false;
            }

            // pr_blockIndexFront is updated inside new_block_index, so we need
            // to update our fallback value too (since we keep the new index
            // even if we later fail)
            originalBlockIndexFront = originalBlockIndexSlotsUsed;
          }

          // Insert a new block in the circular linked list
          auto newBlock =
              this->parent
                  ->ConcurrentQueue::template requisition_block<allocMode>();
          if (newBlock == nullptr) {
            pr_blockIndexFront = originalBlockIndexFront;
            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
            this->tailBlock =
                startBlock == nullptr ? firstAllocatedBlock : startBlock;
            return false;
          }

#ifdef MCDBGQ_TRACKMEM
          newBlock->owner = this;
#endif
          newBlock->ConcurrentQueue::Block::template set_all_empty<
              explicit_context>();
          if (this->tailBlock == nullptr) {
            newBlock->next = newBlock;
          }
          else {
            newBlock->next = this->tailBlock->next;
            this->tailBlock->next = newBlock;
          }
          this->tailBlock = newBlock;
          firstAllocatedBlock = firstAllocatedBlock == nullptr
                                    ? this->tailBlock
                                    : firstAllocatedBlock;

          ++pr_blockIndexSlotsUsed;

          auto& entry = blockIndex.load(std::memory_order_relaxed)
                            ->entries[pr_blockIndexFront];
          entry.base = currentTailIndex;
          entry.block = this->tailBlock;
          pr_blockIndexFront =
              (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
        }

        // Excellent, all allocations succeeded. Reset each block's emptiness
        // before we fill them up, and publish the new block index front
        auto block = firstAllocatedBlock;
        while (true) {
          block->ConcurrentQueue::Block::template reset_empty<
              explicit_context>();
          if (block == this->tailBlock) {
            break;
          }
          block = block->next;
        }

        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
            T, decltype(*itemFirst),
            new (static_cast<T*>(nullptr))
                T(details::deref_noexcept(itemFirst)))) {
          blockIndex.load(std::memory_order_relaxed)
              ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
                            std::memory_order_release);
        }
      }

      // Enqueue, one block at a time
      index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
      currentTailIndex = startTailIndex;
      auto endBlock = this->tailBlock;
      this->tailBlock = startBlock;
      assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
                 0 ||
             firstAllocatedBlock != nullptr || count == 0);
      if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 &&
          firstAllocatedBlock != nullptr) {
        this->tailBlock = firstAllocatedBlock;
      }
      while (true) {
        index_t stopIndex =
            (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
            static_cast<index_t>(QUEUE_BLOCK_SIZE);
        if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
          stopIndex = newTailIndex;
        }
        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
            T, decltype(*itemFirst),
            new (static_cast<T*>(nullptr))
                T(details::deref_noexcept(itemFirst)))) {
          while (currentTailIndex != stopIndex) {
            new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
          }
        }
        else {
          MOODYCAMEL_TRY {
            while (currentTailIndex != stopIndex) {
              // Must use copy constructor even if move constructor is available
              // because we may have to revert if there's an exception.
              // Sorry about the horrible templated next line, but it was the
              // only way to disable moving *at compile time*, which is
              // important because a type may only define a (noexcept) move
              // constructor, and so calls to the cctor will not compile, even
              // if they are in an if branch that will never be executed
              new ((*this->tailBlock)[currentTailIndex]) T(
                  details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
                      T, decltype(*itemFirst),
                      new (static_cast<T*>(nullptr)) T(details::deref_noexcept(
                          itemFirst)))>::eval(*itemFirst));
              ++currentTailIndex;
              ++itemFirst;
            }
          }
          MOODYCAMEL_CATCH(...) {
            // Oh dear, an exception's been thrown -- destroy the elements that
            // were enqueued so far and revert the entire bulk operation (we'll
            // keep any allocated blocks in our linked list for later, though).
            auto constructedStopIndex = currentTailIndex;
            auto lastBlockEnqueued = this->tailBlock;

            pr_blockIndexFront = originalBlockIndexFront;
            pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
            this->tailBlock =
                startBlock == nullptr ? firstAllocatedBlock : startBlock;

            if (!details::is_trivially_destructible<T>::value) {
              auto block = startBlock;
              if ((startTailIndex &
                   static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) {
                block = firstAllocatedBlock;
              }
              currentTailIndex = startTailIndex;
              while (true) {
                stopIndex = (currentTailIndex &
                             ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                            static_cast<index_t>(QUEUE_BLOCK_SIZE);
                if (details::circular_less_than<index_t>(constructedStopIndex,
                                                         stopIndex)) {
                  stopIndex = constructedStopIndex;
                }
                while (currentTailIndex != stopIndex) {
                  (*block)[currentTailIndex++]->~T();
                }
                if (block == lastBlockEnqueued) {
                  break;
                }
                block = block->next;
              }
            }
            MOODYCAMEL_RETHROW;
          }
        }

        if (this->tailBlock == endBlock) {
          assert(currentTailIndex == newTailIndex);
          break;
        }
        this->tailBlock = this->tailBlock->next;
      }

      MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
          T, decltype(*itemFirst),
          new (static_cast<T*>(nullptr))
              T(details::deref_noexcept(itemFirst)))) {
        if (firstAllocatedBlock != nullptr)
          blockIndex.load(std::memory_order_relaxed)
              ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
                            std::memory_order_release);
      }

      this->tailIndex.store(newTailIndex, std::memory_order_release);
      return true;
    }

    // Bulk dequeue for the explicit producer: moves up to `max` elements into
    // the output iterator `itemFirst`, destroying each moved-from element in
    // place, and returns the number actually dequeued (0 if none available).
    //
    // Uses the same optimistic-claim protocol as the single-element dequeue:
    // dequeueOptimisticCount is bumped first, then reconciled against the
    // published tailIndex; any over-claim is repaid via dequeueOvercommit so
    // the counters stay eventually consistent.
    template <typename It>
    size_t dequeue_bulk(It& itemFirst, size_t max) {
      auto tail = this->tailIndex.load(std::memory_order_relaxed);
      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
      // Apparent number of elements available (may be stale; verified after
      // the fetch_add below).
      auto desiredCount = static_cast<size_t>(
          tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
                  overcommit));
      if (details::circular_less_than<size_t>(0, desiredCount)) {
        desiredCount = desiredCount < max ? desiredCount : max;
        std::atomic_thread_fence(std::memory_order_acquire);

        // Optimistically claim `desiredCount` elements before re-checking the
        // real tail.
        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
            desiredCount, std::memory_order_relaxed);

        tail = this->tailIndex.load(std::memory_order_acquire);
        auto actualCount =
            static_cast<size_t>(tail - (myDequeueCount - overcommit));
        if (details::circular_less_than<size_t>(0, actualCount)) {
          actualCount = desiredCount < actualCount ? desiredCount : actualCount;
          if (actualCount < desiredCount) {
            // We claimed more than was really available; repay the difference
            // so future dequeues see the correct effective count.
            this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
                                              std::memory_order_release);
          }

          // Get the first index. Note that since there's guaranteed to be at
          // least actualCount elements, this will never exceed tail.
          auto firstIndex =
              this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);

          // Determine which block the first element is in
          auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
          auto localBlockIndexHead =
              localBlockIndex->front.load(std::memory_order_acquire);

          // Compute the offset (in whole blocks) from the index head's block
          // base to our first element's block base; signed division handles
          // index wrap-around correctly.
          auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
          auto firstBlockBaseIndex =
              firstIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
          auto offset = static_cast<size_t>(
              static_cast<typename std::make_signed<index_t>::type>(
                  firstBlockBaseIndex - headBase) /
              static_cast<typename std::make_signed<index_t>::type>(
                  QUEUE_BLOCK_SIZE));
          auto indexIndex =
              (localBlockIndexHead + offset) & (localBlockIndex->size - 1);

          // Iterate the blocks and dequeue
          auto index = firstIndex;
          do {
            auto firstIndexInBlock = index;
            // Stop at the end of the current block or at the last claimed
            // element, whichever comes first.
            index_t endIndex =
                (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                static_cast<index_t>(QUEUE_BLOCK_SIZE);
            endIndex =
                details::circular_less_than<index_t>(
                    firstIndex + static_cast<index_t>(actualCount), endIndex)
                    ? firstIndex + static_cast<index_t>(actualCount)
                    : endIndex;
            auto block = localBlockIndex->entries[indexIndex].block;
            // Fast path when move-assignment cannot throw: no try/catch
            // machinery is needed.
            if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
                                           details::deref_noexcept(itemFirst) =
                                               std::move((*(*block)[index])))) {
              while (index != endIndex) {
                auto& el = *((*block)[index]);
                *itemFirst++ = std::move(el);
                el.~T();
                ++index;
              }
            }
            else {
              MOODYCAMEL_TRY {
                while (index != endIndex) {
                  auto& el = *((*block)[index]);
                  *itemFirst = std::move(el);
                  ++itemFirst;
                  el.~T();
                  ++index;
                }
              }
              MOODYCAMEL_CATCH(...) {
                // It's too late to revert the dequeue, but we can make sure
                // that all the dequeued objects are properly destroyed and the
                // block index (and empty count) are properly updated before we
                // propagate the exception
                do {
                  block = localBlockIndex->entries[indexIndex].block;
                  while (index != endIndex) {
                    (*block)[index++]->~T();
                  }
                  block->ConcurrentQueue::Block::template set_many_empty<
                      explicit_context>(
                      firstIndexInBlock,
                      static_cast<size_t>(endIndex - firstIndexInBlock));
                  indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);

                  firstIndexInBlock = index;
                  endIndex =
                      (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                      static_cast<index_t>(QUEUE_BLOCK_SIZE);
                  endIndex =
                      details::circular_less_than<index_t>(
                          firstIndex + static_cast<index_t>(actualCount),
                          endIndex)
                          ? firstIndex + static_cast<index_t>(actualCount)
                          : endIndex;
                } while (index != firstIndex + actualCount);

                MOODYCAMEL_RETHROW;
              }
            }
            // Mark the consumed slots empty so the producer can eventually
            // recycle this block.
            block->ConcurrentQueue::Block::template set_many_empty<
                explicit_context>(
                firstIndexInBlock,
                static_cast<size_t>(endIndex - firstIndexInBlock));
            indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
          } while (index != firstIndex + actualCount);

          return actualCount;
        }
        else {
          // Wasn't anything to dequeue after all; make the effective dequeue
          // count eventually consistent
          this->dequeueOvercommit.fetch_add(desiredCount,
                                            std::memory_order_release);
        }
      }

      return 0;
    }

   private:
    // One slot of the explicit producer's circular block index: maps a
    // block's base index (the global index of its first element slot) to
    // the block itself.
    struct BlockIndexEntry {
      index_t base;  // Global index at which `block`'s elements begin
      Block* block;  // The block whose elements start at `base`
    };

    // Header for one generation of the explicit producer's block index.
    // When the index grows, a new (doubled) header is allocated and linked
    // to the previous one via `prev` so old generations can be freed later.
    struct BlockIndexHeader {
      size_t size;  // Number of entries; always a power of two (see
                    // new_block_index, which doubles pr_blockIndexSize)
      std::atomic<size_t>
          front;  // Current slot (not next, like pr_blockIndexFront)
      BlockIndexEntry* entries;  // Aligned array of `size` entries
      void* prev;  // Raw pointer to the previous index allocation (or null)
    };

    // Doubles the capacity of the explicit producer's block index and
    // publishes the new index (the old one stays linked via `prev` so it
    // can be freed later and so concurrent consumers remain valid).
    //
    // `numberOfFilledSlotsToExpose` is the number of already-used slots
    // that should be visible to consumers; the new header's `front` is set
    // to the last of those slots. Returns false (with producer-local state
    // unchanged) if allocation fails.
    bool new_block_index(size_t numberOfFilledSlotsToExpose) {
      auto prevBlockSizeMask = pr_blockIndexSize - 1;

      // Create the new block
      pr_blockIndexSize <<= 1;
      // Single allocation holds the header plus an (alignment-padded) array
      // of entries.
      auto newRawPtr = static_cast<char*>((Traits::malloc)(
          sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
          1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
      if (newRawPtr == nullptr) {
        pr_blockIndexSize >>= 1;  // Reset to allow graceful retry
        return false;
      }

      auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(
          details::align_for<BlockIndexEntry>(newRawPtr +
                                              sizeof(BlockIndexHeader)));

      // Copy in all the old indices, if any
      size_t j = 0;
      if (pr_blockIndexSlotsUsed != 0) {
        // Walk the old circular buffer from its logical start to its front,
        // compacting the used slots to the start of the new array.
        auto i =
            (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
        do {
          newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
          i = (i + 1) & prevBlockSizeMask;
        } while (i != pr_blockIndexFront);
      }

      // Update everything
      auto header = new (newRawPtr) BlockIndexHeader;
      header->size = pr_blockIndexSize;
      header->front.store(numberOfFilledSlotsToExpose - 1,
                          std::memory_order_relaxed);
      header->entries = newBlockIndexEntries;
      header->prev = pr_blockIndexRaw;  // we link the new block to the old one
                                        // so we can free it later

      pr_blockIndexFront = j;
      pr_blockIndexEntries = newBlockIndexEntries;
      pr_blockIndexRaw = newRawPtr;
      // Release-store publishes the fully-initialized header to consumers.
      blockIndex.store(header, std::memory_order_release);

      return true;
    }

   private:
    // Consumer-visible block index (latest generation; see BlockIndexHeader).
    std::atomic<BlockIndexHeader*> blockIndex;

    // pr_-prefixed fields are to be used by the producer only -- consumers
    // must use the ones referenced by blockIndex
    size_t pr_blockIndexSlotsUsed;       // Number of occupied index slots
    size_t pr_blockIndexSize;            // Capacity (power of two)
    size_t pr_blockIndexFront;  // Next slot (not current)
    BlockIndexEntry* pr_blockIndexEntries;  // Entry array of latest index
    void* pr_blockIndexRaw;  // Raw allocation of latest index (for freeing)

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
   public:
    ExplicitProducer* nextExplicitProducer;

   private:
#endif

#ifdef MCDBGQ_TRACKMEM
    friend struct MemStats;
#endif
  };

  //////////////////////////////////
  // Implicit queue
  //////////////////////////////////

  struct ImplicitProducer : public ProducerBase {
    // Constructs an implicit producer owned by `parent_` and eagerly
    // allocates the first block index (capacity starts at
    // IMPLICIT_INITIAL_INDEX_SIZE).
    // NOTE(review): the result of new_block_index() is ignored here, so
    // blockIndex may remain null on allocation failure -- presumably later
    // operations tolerate that; confirm against insert_block_index_entry.
    ImplicitProducer(ConcurrentQueue* parent_)
        : ProducerBase(parent_, false),
          nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
          blockIndex(nullptr) {
      new_block_index();
    }

    // Destroys all undequeued elements, returns in-use blocks to the global
    // free list, and frees every generation of the block index.
    ~ImplicitProducer() {
      // Note that since we're in the destructor we can assume that all
      // enqueue/dequeue operations completed already; this means that all
      // undequeued elements are placed contiguously across contiguous blocks,
      // and that only the first and last remaining blocks can be only partially
      // empty (all other remaining blocks must be completely full).

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
      // Unregister ourselves for thread termination notification
      if (!this->inactive.load(std::memory_order_relaxed)) {
        details::ThreadExitNotifier::unsubscribe(&threadExitListener);
      }
#endif

      // Destroy all remaining elements!
      auto tail = this->tailIndex.load(std::memory_order_relaxed);
      auto index = this->headIndex.load(std::memory_order_relaxed);
      Block* block = nullptr;
      assert(index == tail || details::circular_less_than(index, tail));
      bool forceFreeLastBlock =
          index != tail;  // If we enter the loop, then the last (tail) block
                          // will not be freed
      while (index != tail) {
        // On a block boundary (or first iteration), release the previous
        // block and look up the one containing `index`.
        if ((index & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 ||
            block == nullptr) {
          if (block != nullptr) {
            // Free the old block
            this->parent->add_block_to_free_list(block);
          }

          block = get_block_index_entry_for_index(index)->value.load(
              std::memory_order_relaxed);
        }

        ((*block)[index])->~T();
        ++index;
      }
      // Even if the queue is empty, there's still one block that's not on the
      // free list (unless the head index reached the end of it, in which case
      // the tail will be poised to create a new block).
      if (this->tailBlock != nullptr &&
          (forceFreeLastBlock ||
           (tail & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0)) {
        this->parent->add_block_to_free_list(this->tailBlock);
      }

      // Destroy block index
      auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
      if (localBlockIndex != nullptr) {
        // Destroy the entries of the newest generation, then walk the prev
        // chain freeing each generation's allocation.
        for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
          localBlockIndex->index[i]->~BlockIndexEntry();
        }
        do {
          auto prev = localBlockIndex->prev;
          localBlockIndex->~BlockIndexHeader();
          (Traits::free)(localBlockIndex);
          localBlockIndex = prev;
        } while (localBlockIndex != nullptr);
      }
    }

    // Enqueues a single element (constructed from `element` via forwarding).
    // Returns false if the queue is at capacity or allocation fails; may
    // throw if T's constructor throws (all index/block changes are rolled
    // back first, so the queue state stays consistent).
    template <AllocationMode allocMode, typename U>
    inline bool enqueue(U&& element) {
      index_t currentTailIndex =
          this->tailIndex.load(std::memory_order_relaxed);
      index_t newTailIndex = 1 + currentTailIndex;
      if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) ==
          0) {
        // We reached the end of a block, start a new one
        auto head = this->headIndex.load(std::memory_order_relaxed);
        assert(!details::circular_less_than<index_t>(currentTailIndex, head));
        // Fail if adding another block would overtake the head or exceed
        // MAX_SUBQUEUE_SIZE.
        if (!details::circular_less_than<index_t>(
                head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
            (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
             (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
                                            currentTailIndex - head))) {
          return false;
        }
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
        debug::DebugLock lock(mutex);
#endif
        // Find out where we'll be inserting this block in the block index
        BlockIndexEntry* idxEntry;
        if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
          return false;
        }

        // Get ahold of a new block
        auto newBlock =
            this->parent
                ->ConcurrentQueue::template requisition_block<allocMode>();
        if (newBlock == nullptr) {
          // Undo the index insertion before failing.
          rewind_block_index_tail();
          idxEntry->value.store(nullptr, std::memory_order_relaxed);
          return false;
        }
#ifdef MCDBGQ_TRACKMEM
        newBlock->owner = this;
#endif
        newBlock
            ->ConcurrentQueue::Block::template reset_empty<implicit_context>();

        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
            T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
          // May throw, try to insert now before we publish the fact that we
          // have this new block
          MOODYCAMEL_TRY {
            new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
          }
          MOODYCAMEL_CATCH(...) {
            // Roll back the index entry and return the block before
            // propagating the exception.
            rewind_block_index_tail();
            idxEntry->value.store(nullptr, std::memory_order_relaxed);
            this->parent->add_block_to_free_list(newBlock);
            MOODYCAMEL_RETHROW;
          }
        }

        // Insert the new block into the index
        idxEntry->value.store(newBlock, std::memory_order_relaxed);

        this->tailBlock = newBlock;

        // If the constructor could throw, the element was already placed
        // above; publish the new tail and finish here.
        MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
            T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
          this->tailIndex.store(newTailIndex, std::memory_order_release);
          return true;
        }
      }

      // Enqueue
      new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));

      this->tailIndex.store(newTailIndex, std::memory_order_release);
      return true;
    }

    // Dequeues a single element into `element` via move-assignment.
    // Returns false if the queue appeared empty. If the element's block
    // becomes empty, it is removed from the index and returned to the
    // global free list.
    template <typename U>
    bool dequeue(U& element) {
      // See ExplicitProducer::dequeue for rationale and explanation
      index_t tail = this->tailIndex.load(std::memory_order_relaxed);
      index_t overcommit =
          this->dequeueOvercommit.load(std::memory_order_relaxed);
      if (details::circular_less_than<index_t>(
              this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
                  overcommit,
              tail)) {
        std::atomic_thread_fence(std::memory_order_acquire);

        // Optimistically claim one element, then verify against the real
        // tail; overshoot is repaid via dequeueOvercommit below.
        index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(
            1, std::memory_order_relaxed);
        tail = this->tailIndex.load(std::memory_order_acquire);
        if ((details::likely)(details::circular_less_than<index_t>(
                myDequeueCount - overcommit, tail))) {
          index_t index =
              this->headIndex.fetch_add(1, std::memory_order_acq_rel);

          // Determine which block the element is in
          auto entry = get_block_index_entry_for_index(index);

          // Dequeue
          auto block = entry->value.load(std::memory_order_relaxed);
          auto& el = *((*block)[index]);

          if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
            // Note: Acquiring the mutex with every dequeue instead of only when
            // a block is released is very sub-optimal, but it is, after all,
            // purely debug code.
            debug::DebugLock lock(producer->mutex);
#endif
            // Move-assignment may throw: the Guard's destructor ensures the
            // element is destroyed and the block released even if the
            // exception propagates.
            struct Guard {
              Block* block;
              index_t index;
              BlockIndexEntry* entry;
              ConcurrentQueue* parent;

              ~Guard() {
                (*block)[index]->~T();
                if (block->ConcurrentQueue::Block::template set_empty<
                        implicit_context>(index)) {
                  // Block is fully consumed: unpublish it and recycle it.
                  entry->value.store(nullptr, std::memory_order_relaxed);
                  parent->add_block_to_free_list(block);
                }
              }
            } guard = {block, index, entry, this->parent};

            element = std::move(el);  // NOLINT
          }
          else {
            // Noexcept path: no guard machinery needed.
            element = std::move(el);  // NOLINT
            el.~T();                  // NOLINT

            if (block->ConcurrentQueue::Block::template set_empty<
                    implicit_context>(index)) {
              {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
                debug::DebugLock lock(mutex);
#endif
                // Add the block back into the global free pool (and remove from
                // block index)
                entry->value.store(nullptr, std::memory_order_relaxed);
              }
              this->parent->add_block_to_free_list(
                  block);  // releases the above store
            }
          }

          return true;
        }
        else {
          // Lost the race after all; repay the optimistic claim.
          this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
        }
      }

      return false;
    }

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4706)  // assignment within conditional expression
#endif
    // Bulk enqueue for the implicit producer: enqueues `count` elements read
    // from `itemFirst`. All needed blocks (and index entries) are allocated
    // up front; if any allocation fails, everything is rolled back and false
    // is returned. If an element constructor throws, constructed elements are
    // destroyed and the operation is fully reverted before rethrowing.
    template <AllocationMode allocMode, typename It>
    bool enqueue_bulk(It itemFirst, size_t count) {
      // First, we need to make sure we have enough room to enqueue all of the
      // elements; this means pre-allocating blocks and putting them in the
      // block index (but only if all the allocations succeeded).

      // Note that the tailBlock we start off with may not be owned by us any
      // more; this happens if it was filled up exactly to the top (setting
      // tailIndex to the first index of the next block which is not yet
      // allocated), then dequeued completely (putting it on the free list)
      // before we enqueue again.

      index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
      auto startBlock = this->tailBlock;
      Block* firstAllocatedBlock = nullptr;
      auto endBlock = this->tailBlock;

      // Figure out how many blocks we'll need to allocate, and do so
      size_t blockBaseDiff =
          ((startTailIndex + count - 1) &
           ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
          ((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
      index_t currentTailIndex =
          (startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
      if (blockBaseDiff > 0) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
        debug::DebugLock lock(mutex);
#endif
        do {
          blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
          currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);

          // Find out where we'll be inserting this block in the block index
          BlockIndexEntry* idxEntry =
              nullptr;  // initialization here unnecessary but compiler can't
                        // always tell
          Block* newBlock;
          bool indexInserted = false;
          auto head = this->headIndex.load(std::memory_order_relaxed);
          assert(!details::circular_less_than<index_t>(currentTailIndex, head));
          // Full if the new block would overtake the head, or if it would
          // exceed MAX_SUBQUEUE_SIZE.
          bool full =
              !details::circular_less_than<index_t>(
                  head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
              (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
               (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
                                              currentTailIndex - head));

          if (full ||
              !(indexInserted = insert_block_index_entry<allocMode>(
                    idxEntry, currentTailIndex)) ||
              (newBlock =
                   this->parent->ConcurrentQueue::template requisition_block<
                       allocMode>()) == nullptr) {
            // Index allocation or block allocation failed; revert any other
            // allocations and index insertions done so far for this operation
            if (indexInserted) {
              rewind_block_index_tail();
              idxEntry->value.store(nullptr, std::memory_order_relaxed);
            }
            // Walk the blocks allocated so far, clearing their index entries
            // and rewinding the index tail for each.
            currentTailIndex = (startTailIndex - 1) &
                               ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
            for (auto block = firstAllocatedBlock; block != nullptr;
                 block = block->next) {
              currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
              idxEntry = get_block_index_entry_for_index(currentTailIndex);
              idxEntry->value.store(nullptr, std::memory_order_relaxed);
              rewind_block_index_tail();
            }
            this->parent->add_blocks_to_free_list(firstAllocatedBlock);
            this->tailBlock = startBlock;

            return false;
          }

#ifdef MCDBGQ_TRACKMEM
          newBlock->owner = this;
#endif
          newBlock->ConcurrentQueue::Block::template reset_empty<
              implicit_context>();
          newBlock->next = nullptr;

          // Insert the new block into the index
          idxEntry->value.store(newBlock, std::memory_order_relaxed);

          // Store the chain of blocks so that we can undo if later allocations
          // fail, and so that we can find the blocks when we do the actual
          // enqueueing
          if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
                  0 ||
              firstAllocatedBlock != nullptr) {
            assert(this->tailBlock != nullptr);
            this->tailBlock->next = newBlock;
          }
          this->tailBlock = newBlock;
          endBlock = newBlock;
          firstAllocatedBlock =
              firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
        } while (blockBaseDiff > 0);
      }

      // Enqueue, one block at a time
      index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
      currentTailIndex = startTailIndex;
      this->tailBlock = startBlock;
      assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
                 0 ||
             firstAllocatedBlock != nullptr || count == 0);
      if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 &&
          firstAllocatedBlock != nullptr) {
        this->tailBlock = firstAllocatedBlock;
      }
      while (true) {
        // Fill up to the end of the current block, or to the final index,
        // whichever comes first.
        index_t stopIndex =
            (currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
            static_cast<index_t>(QUEUE_BLOCK_SIZE);
        if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
          stopIndex = newTailIndex;
        }
        MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
            T, decltype(*itemFirst),
            new (static_cast<T*>(nullptr))
                T(details::deref_noexcept(itemFirst)))) {
          // Construction can't throw: fast path with no exception handling.
          while (currentTailIndex != stopIndex) {
            new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
          }
        }
        else {
          MOODYCAMEL_TRY {
            while (currentTailIndex != stopIndex) {
              // nomove_if forces the copy constructor (at compile time) when
              // the move constructor could throw, so we can revert safely.
              new ((*this->tailBlock)[currentTailIndex]) T(
                  details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
                      T, decltype(*itemFirst),
                      new (static_cast<T*>(nullptr)) T(details::deref_noexcept(
                          itemFirst)))>::eval(*itemFirst));
              ++currentTailIndex;
              ++itemFirst;
            }
          }
          MOODYCAMEL_CATCH(...) {
            auto constructedStopIndex = currentTailIndex;
            auto lastBlockEnqueued = this->tailBlock;

            // Destroy everything constructed so far in this operation.
            if (!details::is_trivially_destructible<T>::value) {
              auto block = startBlock;
              if ((startTailIndex &
                   static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) {
                block = firstAllocatedBlock;
              }
              currentTailIndex = startTailIndex;
              while (true) {
                stopIndex = (currentTailIndex &
                             ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                            static_cast<index_t>(QUEUE_BLOCK_SIZE);
                if (details::circular_less_than<index_t>(constructedStopIndex,
                                                         stopIndex)) {
                  stopIndex = constructedStopIndex;
                }
                while (currentTailIndex != stopIndex) {
                  (*block)[currentTailIndex++]->~T();
                }
                if (block == lastBlockEnqueued) {
                  break;
                }
                block = block->next;
              }
            }

            // Unpublish and recycle all the blocks allocated for this
            // operation, rewinding the index tail for each.
            currentTailIndex = (startTailIndex - 1) &
                               ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
            for (auto block = firstAllocatedBlock; block != nullptr;
                 block = block->next) {
              currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
              auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
              idxEntry->value.store(nullptr, std::memory_order_relaxed);
              rewind_block_index_tail();
            }
            this->parent->add_blocks_to_free_list(firstAllocatedBlock);
            this->tailBlock = startBlock;
            MOODYCAMEL_RETHROW;
          }
        }

        if (this->tailBlock == endBlock) {
          assert(currentTailIndex == newTailIndex);
          break;
        }
        this->tailBlock = this->tailBlock->next;
      }
      // Publish all the new elements at once.
      this->tailIndex.store(newTailIndex, std::memory_order_release);
      return true;
    }
#ifdef _MSC_VER
#pragma warning(pop)
#endif

    // Dequeues up to `max` elements into the output iterator `itemFirst` and
    // returns the number actually dequeued (0 if nothing was available).
    // Elements are moved out and destroyed in place; fully-emptied blocks are
    // unlinked from the block index and returned to the free list.
    template <typename It>
    size_t dequeue_bulk(It& itemFirst, size_t max) {
      auto tail = this->tailIndex.load(std::memory_order_relaxed);
      auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
      // Estimated number of available elements; dequeueOptimisticCount may
      // run ahead of successful dequeues by `overcommit`.
      auto desiredCount = static_cast<size_t>(
          tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
                  overcommit));
      if (details::circular_less_than<size_t>(0, desiredCount)) {
        desiredCount = desiredCount < max ? desiredCount : max;
        std::atomic_thread_fence(std::memory_order_acquire);

        // Optimistically claim `desiredCount` elements; any over-claim is
        // compensated for below via dequeueOvercommit.
        auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
            desiredCount, std::memory_order_relaxed);

        tail = this->tailIndex.load(std::memory_order_acquire);
        auto actualCount =
            static_cast<size_t>(tail - (myDequeueCount - overcommit));
        if (details::circular_less_than<size_t>(0, actualCount)) {
          actualCount = desiredCount < actualCount ? desiredCount : actualCount;
          if (actualCount < desiredCount) {
            // We claimed more than was available; record the difference so
            // later dequeue attempts can compensate.
            this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
                                              std::memory_order_release);
          }

          // Get the first index. Note that since there's guaranteed to be at
          // least actualCount elements, this will never exceed tail.
          auto firstIndex =
              this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);

          // Iterate the blocks and dequeue
          auto index = firstIndex;
          BlockIndexHeader* localBlockIndex;
          auto indexIndex =
              get_block_index_index_for_index(index, localBlockIndex);
          do {
            auto blockStartIndex = index;
            // End of this batch within the current block, or the overall end,
            // whichever comes first.
            index_t endIndex =
                (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                static_cast<index_t>(QUEUE_BLOCK_SIZE);
            endIndex =
                details::circular_less_than<index_t>(
                    firstIndex + static_cast<index_t>(actualCount), endIndex)
                    ? firstIndex + static_cast<index_t>(actualCount)
                    : endIndex;

            auto entry = localBlockIndex->index[indexIndex];
            auto block = entry->value.load(std::memory_order_relaxed);
            // Fast path when move-assignment cannot throw; otherwise take the
            // guarded slow path so claimed elements are still destroyed if a
            // move throws.
            if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
                                           details::deref_noexcept(itemFirst) =
                                               std::move((*(*block)[index])))) {
              while (index != endIndex) {
                auto& el = *((*block)[index]);
                *itemFirst++ = std::move(el);
                el.~T();
                ++index;
              }
            }
            else {
              MOODYCAMEL_TRY {
                while (index != endIndex) {
                  auto& el = *((*block)[index]);
                  *itemFirst = std::move(el);
                  ++itemFirst;
                  el.~T();
                  ++index;
                }
              }
              MOODYCAMEL_CATCH(...) {
                // A move threw. The remaining claimed elements must still be
                // destroyed and their blocks released before rethrowing.
                do {
                  entry = localBlockIndex->index[indexIndex];
                  block = entry->value.load(std::memory_order_relaxed);
                  while (index != endIndex) {
                    (*block)[index++]->~T();
                  }

                  if (block->ConcurrentQueue::Block::template set_many_empty<
                          implicit_context>(
                          blockStartIndex,
                          static_cast<size_t>(endIndex - blockStartIndex))) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
                    debug::DebugLock lock(mutex);
#endif
                    entry->value.store(nullptr, std::memory_order_relaxed);
                    this->parent->add_block_to_free_list(block);
                  }
                  indexIndex =
                      (indexIndex + 1) & (localBlockIndex->capacity - 1);

                  blockStartIndex = index;
                  endIndex =
                      (index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
                      static_cast<index_t>(QUEUE_BLOCK_SIZE);
                  endIndex =
                      details::circular_less_than<index_t>(
                          firstIndex + static_cast<index_t>(actualCount),
                          endIndex)
                          ? firstIndex + static_cast<index_t>(actualCount)
                          : endIndex;
                } while (index != firstIndex + actualCount);

                MOODYCAMEL_RETHROW;
              }
            }
            if (block->ConcurrentQueue::Block::template set_many_empty<
                    implicit_context>(
                    blockStartIndex,
                    static_cast<size_t>(endIndex - blockStartIndex))) {
              {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
                debug::DebugLock lock(mutex);
#endif
                // Note that the set_many_empty above did a release, meaning
                // that anybody who acquires the block we're about to free can
                // use it safely since our writes (and reads!) will have
                // happened-before then.
                entry->value.store(nullptr, std::memory_order_relaxed);
              }
              this->parent->add_block_to_free_list(
                  block);  // releases the above store
            }
            indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
          } while (index != firstIndex + actualCount);

          return actualCount;
        }
        else {
          // Nothing was available after all; undo the optimistic claim.
          this->dequeueOvercommit.fetch_add(desiredCount,
                                            std::memory_order_release);
        }
      }

      return 0;
    }

   private:
    // The block size must be > 1, so any number with the low bit set is an
    // invalid block base index
    static const index_t INVALID_BLOCK_BASE = 1;

    // One slot of the implicit producer's circular block index: maps the base
    // index of a block (key) to the block itself (value). A key of
    // INVALID_BLOCK_BASE or a null value marks a slot that may be reused.
    struct BlockIndexEntry {
      std::atomic<index_t> key;
      std::atomic<Block*> value;
    };

    // Header for one generation of the circular block index. `index` holds
    // `capacity` entry pointers (including those carried over from the
    // previous, smaller generation reachable via `prev`); `tail` is the slot
    // of the most recently inserted entry.
    struct BlockIndexHeader {
      size_t capacity;
      std::atomic<size_t> tail;
      BlockIndexEntry* entries;
      BlockIndexEntry** index;
      BlockIndexHeader* prev;
    };

    // Acquires a block-index slot for a block starting at `blockStartIndex`,
    // returning it via `idxEntry`. Tries the next slot of the current index;
    // if that slot is still occupied, grows the index with new_block_index()
    // (unless `allocMode` forbids allocation). Returns false on failure.
    // Called only from the owning producer thread (single writer).
    template <AllocationMode allocMode>
    inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry,
                                         index_t blockStartIndex) {
      auto localBlockIndex =
          blockIndex.load(std::memory_order_relaxed);  // We're the only writer
                                                       // thread, relaxed is OK
      if (localBlockIndex == nullptr) {
        return false;  // this can happen if new_block_index failed in the
                       // constructor
      }
      size_t newTail =
          (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
          (localBlockIndex->capacity - 1);
      idxEntry = localBlockIndex->index[newTail];
      // A slot is free if it was never used (invalid key) or its block has
      // since been released (null value).
      if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
          idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
        idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
        // Release: anyone who acquires the new tail also sees the key above.
        localBlockIndex->tail.store(newTail, std::memory_order_release);
        return true;
      }

      // No room in the old block index, try to allocate another one!
      MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; }
      else if (!new_block_index()) {
        return false;
      }
      else {
        localBlockIndex = blockIndex.load(std::memory_order_relaxed);
        newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
                  (localBlockIndex->capacity - 1);
        idxEntry = localBlockIndex->index[newTail];
        assert(idxEntry->key.load(std::memory_order_relaxed) ==
               INVALID_BLOCK_BASE);
        idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
        localBlockIndex->tail.store(newTail, std::memory_order_release);
        return true;
      }
    }

    inline void rewind_block_index_tail() {
      auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
      localBlockIndex->tail.store(
          (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) &
              (localBlockIndex->capacity - 1),
          std::memory_order_relaxed);
    }

    // Convenience wrapper: resolves a queue-wide element index straight to
    // its BlockIndexEntry pointer.
    inline BlockIndexEntry* get_block_index_entry_for_index(
        index_t index) const {
      BlockIndexHeader* header;
      auto const slot = get_block_index_index_for_index(index, header);
      return header->index[slot];
    }

    // Translates a queue-wide element `index` into a slot position within the
    // current block index, also returning the index header via
    // `localBlockIndex`. The corresponding entry must exist and hold a live
    // block (asserted below).
    inline size_t get_block_index_index_for_index(
        index_t index, BlockIndexHeader*& localBlockIndex) const {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
      debug::DebugLock lock(mutex);
#endif
      // Round down to the base index of the containing block.
      index &= ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
      localBlockIndex = blockIndex.load(std::memory_order_acquire);
      auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
      auto tailBase =
          localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
      assert(tailBase != INVALID_BLOCK_BASE);
      // Note: Must use division instead of shift because the index may wrap
      // around, causing a negative offset, whose negativity we want to preserve
      auto offset = static_cast<size_t>(
          static_cast<typename std::make_signed<index_t>::type>(index -
                                                                tailBase) /
          static_cast<typename std::make_signed<index_t>::type>(
              QUEUE_BLOCK_SIZE));
      // Offset (possibly negative) from the tail slot, modulo capacity.
      size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
      assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) ==
                 index &&
             localBlockIndex->index[idx]->value.load(
                 std::memory_order_relaxed) != nullptr);
      return idx;
    }

    // Allocates the next, larger generation of the block index (capacity
    // nextBlockIndexCapacity) and publishes it, carrying over every entry
    // pointer from the previous generation so existing blocks stay reachable.
    // Returns false if allocation fails. The old header remains linked via
    // `prev`.
    bool new_block_index() {
      auto prev = blockIndex.load(std::memory_order_relaxed);
      size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
      // Only `entryCount` fresh entries are needed: the new index array is
      // double the previous capacity, but half its slots will point at the
      // old generation's entries.
      auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
      // Single raw allocation holding the header, the fresh entries, and the
      // full pointer array, with slack for manual alignment.
      auto raw = static_cast<char*>((Traits::malloc)(
          sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
          1 + sizeof(BlockIndexEntry) * entryCount +
          std::alignment_of<BlockIndexEntry*>::value - 1 +
          sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
      if (raw == nullptr) {
        return false;
      }

      auto header = new (raw) BlockIndexHeader;
      auto entries = reinterpret_cast<BlockIndexEntry*>(
          details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
      auto index = reinterpret_cast<BlockIndexEntry**>(
          details::align_for<BlockIndexEntry*>(
              reinterpret_cast<char*>(entries) +
              sizeof(BlockIndexEntry) * entryCount));
      if (prev != nullptr) {
        // Carry over the old entry pointers in insertion order, starting just
        // past the old tail.
        auto prevTail = prev->tail.load(std::memory_order_relaxed);
        auto prevPos = prevTail;
        size_t i = 0;
        do {
          prevPos = (prevPos + 1) & (prev->capacity - 1);
          index[i++] = prev->index[prevPos];
        } while (prevPos != prevTail);
        assert(i == prevCapacity);
      }
      for (size_t i = 0; i != entryCount; ++i) {
        new (entries + i) BlockIndexEntry;
        entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
        index[prevCapacity + i] = entries + i;
      }
      header->prev = prev;
      header->entries = entries;
      header->index = index;
      header->capacity = nextBlockIndexCapacity;
      // Position the tail so the next insertion lands on the first fresh
      // entry slot.
      header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1),
                         std::memory_order_relaxed);

      // Release-publish so readers that acquire the pointer see a fully
      // initialized index.
      blockIndex.store(header, std::memory_order_release);

      nextBlockIndexCapacity <<= 1;

      return true;
    }

   private:
    size_t nextBlockIndexCapacity;
    std::atomic<BlockIndexHeader*> blockIndex;

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
   public:
    details::ThreadExitListener threadExitListener;

   private:
#endif

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
   public:
    ImplicitProducer* nextImplicitProducer;

   private:
#endif

#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
    mutable debug::DebugMutex mutex;
#endif
#ifdef MCDBGQ_TRACKMEM
    friend struct MemStats;
#endif
  };

  //////////////////////////////////
  // Block pool manipulation
  //////////////////////////////////

  // Pre-allocates `blockCount` blocks as one contiguous array and records
  // them as the initial pool. Blocks are marked as not dynamically allocated
  // so they are never individually freed. On allocation failure the pool size
  // is reset to 0 (the queue then relies on on-demand allocation).
  void populate_initial_block_list(size_t blockCount) {
    initialBlockPoolSize = blockCount;
    if (blockCount == 0) {
      initialBlockPool = nullptr;
      return;
    }

    initialBlockPool = create_array<Block>(blockCount);
    if (initialBlockPool == nullptr) {
      initialBlockPoolSize = 0;
      return;
    }
    for (size_t i = 0; i != blockCount; ++i) {
      initialBlockPool[i].dynamicallyAllocated = false;
    }
  }

  // Hands out the next unused block from the pre-allocated initial pool, or
  // nullptr once the pool is exhausted. The cheap pre-check avoids bumping
  // the counter forever after exhaustion; the post-check handles races on the
  // last few slots.
  inline Block* try_get_block_from_initial_pool() {
    if (initialBlockPoolIndex.load(std::memory_order_relaxed) >=
        initialBlockPoolSize) {
      return nullptr;
    }

    auto slot = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
    if (slot >= initialBlockPoolSize) {
      return nullptr;
    }
    return initialBlockPool + slot;
  }

  // Returns one block to the global free list — unless it was heap-allocated
  // and the traits say not to recycle such blocks, in which case it is
  // destroyed outright.
  inline void add_block_to_free_list(Block* block) {
#ifdef MCDBGQ_TRACKMEM
    block->owner = nullptr;
#endif
    bool const recycle =
        Traits::RECYCLE_ALLOCATED_BLOCKS || !block->dynamicallyAllocated;
    if (recycle) {
      freeList.add(block);
    }
    else {
      destroy(block);
    }
  }

  // Releases an entire chain of blocks (linked through `next`) one at a time.
  // Each `next` pointer is captured before its block is handed off.
  inline void add_blocks_to_free_list(Block* block) {
    for (Block* next; block != nullptr; block = next) {
      next = block->next;
      add_block_to_free_list(block);
    }
  }

  // Attempts to pop a recycled block off the global free list; returns
  // nullptr when none is available.
  inline Block* try_get_block_from_free_list() {
    return freeList.try_get();
  }

  // Gets a free block from one of the memory pools, or allocates a new one (if
  // applicable)
  // Obtains a block for a producer: first from the pre-allocated initial
  // pool, then from the free list, and finally — only when `canAlloc`
  // permits — by allocating a brand-new one. Returns nullptr when no block
  // can be obtained.
  template <AllocationMode canAlloc>
  Block* requisition_block() {
    Block* candidate = try_get_block_from_initial_pool();
    if (candidate == nullptr) {
      candidate = try_get_block_from_free_list();
    }
    if (candidate != nullptr) {
      return candidate;
    }

    MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { return create<Block>(); }
    else {
      return nullptr;
    }
  }

#ifdef MCDBGQ_TRACKMEM
 public:
  // Debug-only (MCDBGQ_TRACKMEM) snapshot of the queue's memory footprint:
  // block counts by category, producer counts, and rough byte totals.
  // Populated by getFor(); not thread-safe (see getMemStats()).
  struct MemStats {
    size_t allocatedBlocks;
    size_t usedBlocks;
    size_t freeBlocks;
    size_t ownedBlocksExplicit;
    size_t ownedBlocksImplicit;
    size_t implicitProducers;
    size_t explicitProducers;
    size_t elementsEnqueued;
    size_t blockClassBytes;
    size_t queueClassBytes;
    size_t implicitBlockIndexBytes;
    size_t explicitBlockIndexBytes;

    friend class ConcurrentQueue;

   private:
    // Walks the free list, the producer list, and each producer's block
    // index to tally the statistics above.
    static MemStats getFor(ConcurrentQueue* q) {
      MemStats stats = {0};

      stats.elementsEnqueued = q->size_approx();

      // Blocks currently sitting on the global free list.
      auto block = q->freeList.head_unsafe();
      while (block != nullptr) {
        ++stats.allocatedBlocks;
        ++stats.freeBlocks;
        block = block->freeListNext.load(std::memory_order_relaxed);
      }

      for (auto ptr = q->producerListTail.load(std::memory_order_acquire);
           ptr != nullptr; ptr = ptr->next_prod()) {
        // Producer kind is detected via RTTI.
        bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
        stats.implicitProducers += implicit ? 1 : 0;
        stats.explicitProducers += implicit ? 0 : 1;

        if (implicit) {
          auto prod = static_cast<ImplicitProducer*>(ptr);
          stats.queueClassBytes += sizeof(ImplicitProducer);
          auto head = prod->headIndex.load(std::memory_order_relaxed);
          auto tail = prod->tailIndex.load(std::memory_order_relaxed);
          auto hash = prod->blockIndex.load(std::memory_order_relaxed);
          if (hash != nullptr) {
            // Count live blocks referenced by the current index generation.
            for (size_t i = 0; i != hash->capacity; ++i) {
              if (hash->index[i]->key.load(std::memory_order_relaxed) !=
                      ImplicitProducer::INVALID_BLOCK_BASE &&
                  hash->index[i]->value.load(std::memory_order_relaxed) !=
                      nullptr) {
                ++stats.allocatedBlocks;
                ++stats.ownedBlocksImplicit;
              }
            }
            stats.implicitBlockIndexBytes +=
                hash->capacity *
                sizeof(typename ImplicitProducer::BlockIndexEntry);
            // Older index generations remain linked via `prev`.
            for (; hash != nullptr; hash = hash->prev) {
              stats.implicitBlockIndexBytes +=
                  sizeof(typename ImplicitProducer::BlockIndexHeader) +
                  hash->capacity *
                      sizeof(typename ImplicitProducer::BlockIndexEntry*);
            }
          }
          // One used block per QUEUE_BLOCK_SIZE span between head and tail.
          for (; details::circular_less_than<index_t>(head, tail);
               head += QUEUE_BLOCK_SIZE) {
            // auto block = prod->get_block_index_entry_for_index(head);
            ++stats.usedBlocks;
          }
        }
        else {
          auto prod = static_cast<ExplicitProducer*>(ptr);
          stats.queueClassBytes += sizeof(ExplicitProducer);
          // Explicit producers keep their blocks in a circular list rooted at
          // tailBlock.
          auto tailBlock = prod->tailBlock;
          bool wasNonEmpty = false;
          if (tailBlock != nullptr) {
            auto block = tailBlock;
            do {
              ++stats.allocatedBlocks;
              if (!block->ConcurrentQueue::Block::template is_empty<
                      explicit_context>() ||
                  wasNonEmpty) {
                ++stats.usedBlocks;
                wasNonEmpty = wasNonEmpty || block != tailBlock;
              }
              ++stats.ownedBlocksExplicit;
              block = block->next;
            } while (block != tailBlock);
          }
          auto index = prod->blockIndex.load(std::memory_order_relaxed);
          while (index != nullptr) {
            stats.explicitBlockIndexBytes +=
                sizeof(typename ExplicitProducer::BlockIndexHeader) +
                index->size *
                    sizeof(typename ExplicitProducer::BlockIndexEntry);
            index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(
                index->prev);
          }
        }
      }

      // Blocks never handed out from the initial pool are also free.
      auto freeOnInitialPool =
          q->initialBlockPoolIndex.load(std::memory_order_relaxed) >=
                  q->initialBlockPoolSize
              ? 0
              : q->initialBlockPoolSize -
                    q->initialBlockPoolIndex.load(std::memory_order_relaxed);
      stats.allocatedBlocks += freeOnInitialPool;
      stats.freeBlocks += freeOnInitialPool;

      stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
      stats.queueClassBytes += sizeof(ConcurrentQueue);

      return stats;
    }
  };

  // For debugging only. Not thread-safe: traverses producer and free-list
  // state (including freeList.head_unsafe()) without synchronizing against
  // concurrent queue operations.
  MemStats getMemStats() { return MemStats::getFor(this); }

 private:
  friend struct MemStats;
#endif

  //////////////////////////////////
  // Producer list manipulation
  //////////////////////////////////

  // Returns a producer of the requested kind (explicit or implicit). First
  // tries to atomically re-activate an inactive producer already on the list;
  // failing that, creates a fresh one and registers it. Returns nullptr only
  // if creating a new producer fails (allocation failure — see add_producer).
  ProducerBase* recycle_or_create_producer(bool isExplicit) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
    debug::DebugLock lock(implicitProdMutex);
#endif
    // Try to re-use one first
    for (auto ptr = producerListTail.load(std::memory_order_acquire);
         ptr != nullptr; ptr = ptr->next_prod()) {
      if (ptr->inactive.load(std::memory_order_relaxed) &&
          ptr->isExplicit == isExplicit) {
        bool expected = true;
        // The CAS guarantees at most one thread claims any given inactive
        // producer.
        if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false,
                                                  std::memory_order_acquire,
                                                  std::memory_order_relaxed)) {
          // We caught one! It's been marked as activated, the caller can have
          // it
          return ptr;
        }
      }
    }

    return add_producer(
        isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this))
                   : create<ImplicitProducer>(this));
  }

  // Registers a newly created producer on the lock-free producer list (and,
  // in internal-debug builds, on the per-kind debug lists). Passes nullptr
  // through unchanged so callers can chain allocation failure. Returns
  // `producer`.
  ProducerBase* add_producer(ProducerBase* producer) {
    // Handle failed memory allocation
    if (producer == nullptr) {
      return nullptr;
    }

    producerCount.fetch_add(1, std::memory_order_relaxed);

    // Add it to the lock-free list
    // Classic CAS push: `next` is re-linked on every failed attempt since the
    // CAS updates prevTail on failure.
    auto prevTail = producerListTail.load(std::memory_order_relaxed);
    do {
      producer->next = prevTail;
    } while (!producerListTail.compare_exchange_weak(
        prevTail, producer, std::memory_order_release,
        std::memory_order_relaxed));

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
    // Same CAS-push pattern on the kind-specific debug lists.
    if (producer->isExplicit) {
      auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
      do {
        static_cast<ExplicitProducer*>(producer)->nextExplicitProducer =
            prevTailExplicit;
      } while (!explicitProducers.compare_exchange_weak(
          prevTailExplicit, static_cast<ExplicitProducer*>(producer),
          std::memory_order_release, std::memory_order_relaxed));
    }
    else {
      auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
      do {
        static_cast<ImplicitProducer*>(producer)->nextImplicitProducer =
            prevTailImplicit;
      } while (!implicitProducers.compare_exchange_weak(
          prevTailImplicit, static_cast<ImplicitProducer*>(producer),
          std::memory_order_release, std::memory_order_relaxed));
    }
#endif

    return producer;
  }

  // After this queue is moved into or swapped with another instance, every
  // producer we inherited still points at its previous owner; walk the list
  // and re-point each one at us.
  void reown_producers() {
    auto node = producerListTail.load(std::memory_order_relaxed);
    while (node != nullptr) {
      node->parent = this;
      node = node->next_prod();
    }
  }

  //////////////////////////////////
  // Implicit producer hash
  //////////////////////////////////

  // One slot of the implicit-producer hash: maps a thread id (key) to that
  // thread's ImplicitProducer (value). Move operations exist so the embedded
  // initial entry array can be swapped between queues (see
  // swap_implicit_producer_hashes).
  struct ImplicitProducerKVP {
    std::atomic<details::thread_id_t> key;
    ImplicitProducer* value;  // No need for atomicity since it's only read by
                              // the thread that sets it in the first place

    ImplicitProducerKVP() : value(nullptr) {}

    // Move-construction copies key and value; relaxed ordering is used —
    // presumably moves never race with concurrent hash access (confirm
    // against callers).
    ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT {
      key.store(other.key.load(std::memory_order_relaxed),
                std::memory_order_relaxed);
      value = other.value;
    }

    // Move-assignment is implemented as a swap: the moved-from slot receives
    // this slot's previous contents.
    inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other)
        MOODYCAMEL_NOEXCEPT {
      swap(other);
      return *this;
    }

    inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT {
      if (this != &other) {
        details::swap_relaxed(key, other.key);
        std::swap(value, other.value);
      }
    }
  };

  template <typename XT, typename XTraits>
  friend void moodycamel::swap(
      typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&,
      typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&)
      MOODYCAMEL_NOEXCEPT;

  // One generation of the open-addressed implicit-producer hash table.
  // Superseded (smaller) generations stay reachable via `prev` and are still
  // searched on lookup (see get_or_add_implicit_producer).
  struct ImplicitProducerHash {
    size_t capacity;
    ImplicitProducerKVP* entries;
    ImplicitProducerHash* prev;
  };

  // Initializes the embedded, fixed-size first implicit-producer hash table:
  // resets the producer count, marks every slot empty, and publishes the
  // table as the current hash. Compiles away entirely when
  // INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0. Relaxed stores are used —
  // presumably this runs before the queue is visible to other threads
  // (constructor-time); confirm against callers.
  inline void populate_initial_implicit_producer_hash() {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
      return;
    }
    else {
      implicitProducerHashCount.store(0, std::memory_order_relaxed);
      auto hash = &initialImplicitProducerHash;
      hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
      hash->entries = &initialImplicitProducerHashEntries[0];
      // An invalid thread id marks an empty slot.
      for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
        initialImplicitProducerHashEntries[i].key.store(
            details::invalid_thread_id, std::memory_order_relaxed);
      }
      hash->prev = nullptr;
      implicitProducerHash.store(hash, std::memory_order_relaxed);
    }
  }

  // Exchanges all implicit-producer hash state with `other` as part of a
  // whole-queue swap. The first hash table is embedded in the queue object
  // itself and cannot change address, so after swapping the entry arrays and
  // head pointers, any pointer that still refers to the *other* queue's
  // embedded table — either the head pointer or a `prev` link somewhere down
  // the chain — must be re-pointed at ours, and vice versa.
  void swap_implicit_producer_hashes(ConcurrentQueue& other) {
    MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
      return;
    }
    else {
      // Swap (assumes our implicit producer hash is initialized)
      initialImplicitProducerHashEntries.swap(
          other.initialImplicitProducerHashEntries);
      initialImplicitProducerHash.entries =
          &initialImplicitProducerHashEntries[0];
      other.initialImplicitProducerHash.entries =
          &other.initialImplicitProducerHashEntries[0];

      details::swap_relaxed(implicitProducerHashCount,
                            other.implicitProducerHashCount);

      details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
      // Fix up our side: either our head now points at other's embedded
      // table, or some `prev` link deeper in the chain does.
      if (implicitProducerHash.load(std::memory_order_relaxed) ==
          &other.initialImplicitProducerHash) {
        implicitProducerHash.store(&initialImplicitProducerHash,
                                   std::memory_order_relaxed);
      }
      else {
        ImplicitProducerHash* hash;
        for (hash = implicitProducerHash.load(std::memory_order_relaxed);
             hash->prev != &other.initialImplicitProducerHash;
             hash = hash->prev) {
          continue;
        }
        hash->prev = &initialImplicitProducerHash;
      }
      // Mirror-image fix-up for the other queue.
      if (other.implicitProducerHash.load(std::memory_order_relaxed) ==
          &initialImplicitProducerHash) {
        other.implicitProducerHash.store(&other.initialImplicitProducerHash,
                                         std::memory_order_relaxed);
      }
      else {
        ImplicitProducerHash* hash;
        for (hash = other.implicitProducerHash.load(std::memory_order_relaxed);
             hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
          continue;
        }
        hash->prev = &other.initialImplicitProducerHash;
      }
    }
  }

  // Returns the implicit producer associated with the calling thread,
  // creating (or recycling) one and inserting it into the thread-id hash on
  // the thread's first use.
  // Only fails (returns nullptr) if memory allocation fails
  ImplicitProducer* get_or_add_implicit_producer() {
    // Note that since the data is essentially thread-local (key is thread ID),
    // there's a reduced need for fences (memory ordering is already consistent
    // for any individual thread), except for the current table itself.

    // Start by looking for the thread ID in the current and all previous hash
    // tables. If it's not found, it must not be in there yet, since this same
    // thread would have added it previously to one of the tables that we
    // traversed.

    // Code and algorithm adapted from
    // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table

#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
    debug::DebugLock lock(implicitProdMutex);
#endif

    auto id = details::thread_id();
    auto hashedId = details::hash_thread_id(id);

    auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
    assert(
        mainHash !=
        nullptr);  // silence clang-tidy and MSVC warnings (hash cannot be null)
    // Linear-probe each hash generation, newest first.
    for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
      // Look for the id in this hash
      auto index = hashedId;
      while (true) {  // Not an infinite loop because at least one slot is free
                      // in the hash table
        index &= hash->capacity - 1u;

        auto probedKey =
            hash->entries[index].key.load(std::memory_order_relaxed);
        if (probedKey == id) {
          // Found it! If we had to search several hashes deep, though, we
          // should lazily add it to the current main hash table to avoid the
          // extended search next time. Note there's guaranteed to be room in
          // the current hash table since every subsequent table implicitly
          // reserves space for all previous tables (there's only one
          // implicitProducerHashCount).
          auto value = hash->entries[index].value;
          if (hash != mainHash) {
            index = hashedId;
            while (true) {
              index &= mainHash->capacity - 1u;
              auto empty = details::invalid_thread_id;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
              // With thread-local support, slots of exited threads
              // (invalid_thread_id2) may also be claimed.
              auto reusable = details::invalid_thread_id2;
              if (mainHash->entries[index].key.compare_exchange_strong(
                      empty, id, std::memory_order_seq_cst,
                      std::memory_order_relaxed) ||
                  mainHash->entries[index].key.compare_exchange_strong(
                      reusable, id, std::memory_order_seq_cst,
                      std::memory_order_relaxed)) {
#else
              if (mainHash->entries[index].key.compare_exchange_strong(
                      empty, id, std::memory_order_seq_cst,
                      std::memory_order_relaxed)) {
#endif
                mainHash->entries[index].value = value;
                break;
              }
              ++index;
            }
          }

          return value;
        }
        if (probedKey == details::invalid_thread_id) {
          break;  // Not in this hash table
        }
        ++index;
      }
    }

    // Insert!
    auto newCount =
        1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
    while (true) {
      // Grow once the table is at least half full; the test_and_set flag
      // ensures exactly one thread performs the resize.
      // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
      if (newCount >= (mainHash->capacity >> 1) &&
          !implicitProducerHashResizeInProgress.test_and_set(
              std::memory_order_acquire)) {
        // We've acquired the resize lock, try to allocate a bigger hash table.
        // Note the acquire fence synchronizes with the release fence at the end
        // of this block, and hence when we reload implicitProducerHash it must
        // be the most recent version (it only gets changed within this locked
        // block).
        mainHash = implicitProducerHash.load(std::memory_order_acquire);
        if (newCount >= (mainHash->capacity >> 1)) {
          size_t newCapacity = mainHash->capacity << 1;
          while (newCount >= (newCapacity >> 1)) {
            newCapacity <<= 1;
          }
          // Header and entry array share one raw allocation, with slack for
          // manual alignment.
          auto raw = static_cast<char*>(
              (Traits::malloc)(sizeof(ImplicitProducerHash) +
                               std::alignment_of<ImplicitProducerKVP>::value -
                               1 + sizeof(ImplicitProducerKVP) * newCapacity));
          if (raw == nullptr) {
            // Allocation failed
            implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
            implicitProducerHashResizeInProgress.clear(
                std::memory_order_relaxed);
            return nullptr;
          }

          auto newHash = new (raw) ImplicitProducerHash;
          newHash->capacity = static_cast<size_t>(newCapacity);
          newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(
              details::align_for<ImplicitProducerKVP>(
                  raw + sizeof(ImplicitProducerHash)));
          for (size_t i = 0; i != newCapacity; ++i) {
            new (newHash->entries + i) ImplicitProducerKVP;
            newHash->entries[i].key.store(details::invalid_thread_id,
                                          std::memory_order_relaxed);
          }
          newHash->prev = mainHash;
          implicitProducerHash.store(newHash, std::memory_order_release);
          implicitProducerHashResizeInProgress.clear(std::memory_order_release);
          mainHash = newHash;
        }
        else {
          implicitProducerHashResizeInProgress.clear(std::memory_order_release);
        }
      }

      // If it's < three-quarters full, add to the old one anyway so that we
      // don't have to wait for the next table to finish being allocated by
      // another thread (and if we just finished allocating above, the condition
      // will always be true)
      if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
        auto producer =
            static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
        if (producer == nullptr) {
          implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
          return nullptr;
        }

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
        // Arrange for this producer to be cleaned out of the hash when the
        // owning thread exits.
        producer->threadExitListener.callback =
            &ConcurrentQueue::implicit_producer_thread_exited_callback;
        producer->threadExitListener.userData = producer;
        details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
#endif

        // Linear probing: claim an empty slot, or (with thread-local support)
        // a slot left behind by an exited thread.
        auto index = hashedId;
        while (true) {
          index &= mainHash->capacity - 1u;
          auto empty = details::invalid_thread_id;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
          auto reusable = details::invalid_thread_id2;
          if (mainHash->entries[index].key.compare_exchange_strong(
                  reusable, id, std::memory_order_seq_cst,
                  std::memory_order_relaxed)) {
            implicitProducerHashCount.fetch_sub(
                1,
                std::memory_order_relaxed);  // already counted as a used slot
            mainHash->entries[index].value = producer;
            break;
          }
#endif
          if (mainHash->entries[index].key.compare_exchange_strong(
                  empty, id, std::memory_order_seq_cst,
                  std::memory_order_relaxed)) {
            mainHash->entries[index].value = producer;
            break;
          }
          ++index;
        }
        return producer;
      }

      // Hmm, the old hash is quite full and somebody else is busy allocating a
      // new one. We need to wait for the allocating thread to finish (if it
      // succeeds, we add, if not, we try to allocate ourselves).
      mainHash = implicitProducerHash.load(std::memory_order_acquire);
    }
  }

#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
  // Invoked (via the thread-exit listener) when a thread that owned an
  // implicit producer exits: removes that thread's entry from the implicit
  // producer hash tables and marks the producer as recyclable.
  void implicit_producer_thread_exited(ImplicitProducer* producer) {
    // Remove from hash
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
    debug::DebugLock lock(implicitProdMutex);
#endif
    auto hash = implicitProducerHash.load(std::memory_order_acquire);
    assert(hash != nullptr);  // The thread exit listener is only registered if
                              // we were added to a hash in the first place
    auto id = details::thread_id();
    auto hashedId = details::hash_thread_id(id);
    details::thread_id_t probedKey;

    // We need to traverse all the hashes just in case other threads aren't on
    // the current one yet and are trying to add an entry thinking there's a
    // free slot (because they reused a producer)
    for (; hash != nullptr; hash = hash->prev) {
      auto index = hashedId;
      do {
        index &= hash->capacity - 1u;
        probedKey = id;
        // CAS our thread id to invalid_thread_id2, marking the slot (and its
        // producer) as reusable by a future thread.
        if (hash->entries[index].key.compare_exchange_strong(
                probedKey, details::invalid_thread_id2,
                std::memory_order_seq_cst, std::memory_order_relaxed)) {
          break;
        }
        ++index;
      } while (
          probedKey !=
          details::invalid_thread_id);  // Can happen if the hash has changed
                                        // but we weren't put back in it yet, or
                                        // if we weren't added to this hash in
                                        // the first place
    }

    // Mark the queue as being recyclable
    producer->inactive.store(true, std::memory_order_release);
  }

  // Trampoline registered as the thread-exit listener callback: recovers the
  // producer from the opaque userData pointer and forwards the event to the
  // producer's parent queue.
  static void implicit_producer_thread_exited_callback(void* userData) {
    auto producer = static_cast<ImplicitProducer*>(userData);
    producer->parent->implicit_producer_thread_exited(producer);
  }
#endif

  //////////////////////////////////
  // Utility functions
  //////////////////////////////////

  // Allocates `size` bytes with alignment suitable for TAlign, using
  // Traits::malloc. When TAlign needs no more alignment than max_align_t,
  // a plain malloc suffices; otherwise we over-allocate and store the raw
  // pointer just before the aligned block so aligned_free can recover it.
  template <typename TAlign>
  static inline void* aligned_malloc(size_t size) {
    MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
                            std::alignment_of<details::max_align_t>::value)
    return (Traits::malloc)(size);
    else {
      size_t alignment = std::alignment_of<TAlign>::value;
      // Extra room: worst-case alignment adjustment plus space for the
      // stashed back-pointer.
      void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
      if (!raw)
        return nullptr;
      char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) +
                                             sizeof(void*));
      // Store the original (unaligned) pointer immediately before the block
      // we hand out; aligned_free reads it back to call Traits::free.
      *(reinterpret_cast<void**>(ptr) - 1) = raw;
      return ptr;
    }
  }

  // Releases memory obtained from aligned_malloc<TAlign>. For over-aligned
  // types, the original malloc'd pointer stored just before `ptr` is
  // retrieved and passed to Traits::free; a null `ptr` is forwarded as null.
  template <typename TAlign>
  static inline void aligned_free(void* ptr) {
    MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
                            std::alignment_of<details::max_align_t>::value)
    return (Traits::free)(ptr);
    else(Traits::free)(ptr ? *(reinterpret_cast<void**>(ptr) - 1) : nullptr);
  }

  // Allocates aligned storage for `count` objects of type U and
  // default-constructs each element in place. Returns nullptr if the
  // underlying allocation fails.
  template <typename U>
  static inline U* create_array(size_t count) {
    assert(count > 0);
    auto arr = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
    if (arr == nullptr) {
      return nullptr;
    }
    for (size_t idx = 0; idx != count; ++idx) {
      new (arr + idx) U();
    }
    return arr;
  }

  // Destroys `count` elements in reverse construction order, then releases
  // the aligned storage. A null pointer skips destruction and is forwarded
  // to aligned_free.
  template <typename U>
  static inline void destroy_array(U* p, size_t count) {
    if (p != nullptr) {
      assert(count > 0);
      size_t idx = count;
      while (idx != 0) {
        --idx;
        p[idx].~U();
      }
    }
    aligned_free<U>(p);
  }

  // Allocates aligned storage for a single U and default-constructs it;
  // returns nullptr when allocation fails.
  template <typename U>
  static inline U* create() {
    void* mem = aligned_malloc<U>(sizeof(U));
    if (mem == nullptr) {
      return nullptr;
    }
    return new (mem) U;
  }

  // As create<U>(), but perfectly forwards one constructor argument.
  template <typename U, typename A1>
  static inline U* create(A1&& a1) {
    void* mem = aligned_malloc<U>(sizeof(U));
    if (mem == nullptr) {
      return nullptr;
    }
    return new (mem) U(std::forward<A1>(a1));
  }

  // Runs U's destructor (when p is non-null) and frees the aligned storage;
  // safe to call with nullptr.
  template <typename U>
  static inline void destroy(U* p) {
    if (p != nullptr) {
      p->~U();
    }
    aligned_free<U>(p);
  }

 private:
  std::atomic<ProducerBase*> producerListTail;
  std::atomic<std::uint32_t> producerCount;

  std::atomic<size_t> initialBlockPoolIndex;
  Block* initialBlockPool;
  size_t initialBlockPoolSize;

#ifndef MCDBGQ_USEDEBUGFREELIST
  FreeList<Block> freeList;
#else
  debug::DebugFreeList<Block> freeList;
#endif

  std::atomic<ImplicitProducerHash*> implicitProducerHash;
  std::atomic<size_t>
      implicitProducerHashCount;  // Number of slots logically used
  ImplicitProducerHash initialImplicitProducerHash;
  std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE>
      initialImplicitProducerHashEntries;
  std::atomic_flag implicitProducerHashResizeInProgress;

  std::atomic<std::uint32_t> nextExplicitConsumerId;
  std::atomic<std::uint32_t> globalExplicitConsumerOffset;

#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
  debug::DebugMutex implicitProdMutex;
#endif

#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
  std::atomic<ExplicitProducer*> explicitProducers;
  std::atomic<ImplicitProducer*> implicitProducers;
#endif
};

// Binds this token to an explicit producer obtained from the queue
// (recycling an inactive one when available). On allocation failure the
// token is left with a null producer.
template <typename T, typename Traits>
ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
    : producer(queue.recycle_or_create_producer(true)) {
  if (producer) {
    producer->token = this;
  }
}

// Blocking-queue overload: reaches the underlying ConcurrentQueue through a
// reinterpret_cast. NOTE(review): this relies on BlockingConcurrentQueue
// placing its inner queue at the same address — pre-existing contract, not
// changed here.
template <typename T, typename Traits>
ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
    : producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
                   ->recycle_or_create_producer(true)) {
  if (producer) {
    producer->token = this;
  }
}

// Registers a new explicit consumer: claims a unique consumer id from the
// queue's counter and starts with no producer selected.
template <typename T, typename Traits>
ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
    : itemsConsumedFromCurrent(0),
      currentProducer(nullptr),
      desiredProducer(nullptr) {
  lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
  initialOffset =
      queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
}

// Blocking-queue overload of the consumer-token constructor; accesses the
// underlying ConcurrentQueue via reinterpret_cast (pre-existing contract of
// BlockingConcurrentQueue's layout — not changed here).
template <typename T, typename Traits>
ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
    : itemsConsumedFromCurrent(0),
      currentProducer(nullptr),
      desiredProducer(nullptr) {
  lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
  initialOffset =
      reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
          ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
}

// ADL-enabled free swap for ConcurrentQueue; delegates to the member swap.
template <typename T, typename Traits>
inline void swap(ConcurrentQueue<T, Traits>& a,
                 ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT {
  a.swap(b);
}

// ADL-enabled free swap for ProducerToken; delegates to the member swap.
inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT {
  a.swap(b);
}

// ADL-enabled free swap for ConsumerToken; delegates to the member swap.
inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT {
  a.swap(b);
}

// ADL-enabled free swap for the implicit-producer hash key/value pair;
// delegates to the member swap.
template <typename T, typename Traits>
inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
                 typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
    MOODYCAMEL_NOEXCEPT {
  a.swap(b);
}

}  // namespace ylt::detail::moodycamel

#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
#endif

#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic pop
#endif